Create a Django web app with MySQL

Following https://docs.djangoproject.com/en/3.0/intro/tutorial01/, you can start a new project and app:
1. django-admin startproject mysite
2. python manage.py runserver
3. python manage.py startapp polls
4. edit polls/views.py and urls.py
5. edit mysite/urls.py and settings.py to include MySQL db info:

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        'NAME': 'EmployeeDB',
        'USER': 'monty',
        'PASSWORD': 'somIUpass#98',
        'HOST': '192.168.0.28'
    }
}
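Note that the django.db.backends.mysql engine needs a MySQL driver installed in your environment; the usual choice is mysqlclient (this assumes the MySQL client libraries are already available on your machine):

pip install mysqlclient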

polls/models.py:
from django.db import models
class Question(models.Model):
    question_text = models.CharField(max_length=200)
    pub_date = models.DateTimeField('date published')
class Choice(models.Model):
    question = models.ForeignKey(Question, on_delete=models.CASCADE)
    choice_text = models.CharField(max_length=200)
    votes = models.IntegerField(default=0)
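Before makemigrations will pick up the polls models, the app also has to be registered in mysite/settings.py; a minimal sketch, assuming the default AppConfig that startapp generated:

INSTALLED_APPS = [
    'polls.apps.PollsConfig',
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
]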
PS C:\Users\zhuby\mysite>  python manage.py migrate
Operations to perform:
  Apply all migrations: admin, auth, contenttypes, sessions
Running migrations:
  Applying contenttypes.0001_initial... OK
  Applying auth.0001_initial... OK
  Applying admin.0001_initial... OK
  Applying admin.0002_logentry_remove_auto_add... OK
  Applying admin.0003_logentry_add_action_flag_choices... OK
  Applying auth.0002_alter_permission_name_max_length... OK
  Applying auth.0003_alter_user_email_max_length... OK
  Applying auth.0004_alter_user_username_opts... OK
  Applying auth.0005_alter_user_last_login_null... OK
  Applying auth.0007_alter_validators_add_error_messages... OK
  Applying auth.0008_alter_user_username_max_length... OK
  Applying auth.0009_alter_user_last_name_max_length... OK
  Applying auth.0010_alter_group_name_max_length... OK
  Applying auth.0011_update_proxy_permissions... OK
  Applying sessions.0001_initial... OK
PS C:\Users\zhuby\mysite> python manage.py makemigrations polls
Migrations for 'polls':
  polls\migrations\0001_initial.py
    - Create model Choice
PS C:\Users\zhuby\mysite> python manage.py sqlmigrate polls 0001
--
-- Create model Question
CREATE TABLE `polls_question` (`id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY, `question_text` varchar(200) NOT NULL, `pub_date` datetime(6) NOT NULL);
--
-- Create model Choice
--
CREATE TABLE `polls_choice` (`id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY, `choice_text` varchar(200) NOT NULL, `votes` integer NOT NULL, `question_id` integer NOT NULL);
ALTER TABLE `polls_choice` ADD CONSTRAINT `polls_choice_question_id_c5b4b260_fk_polls_question_id` FOREIGN KEY (`question_id`) REFERENCES `polls_question` (`id`);
PS C:\Users\zhuby\mysite> python manage.py migrate
Operations to perform:
  Apply all migrations: admin, auth, contenttypes, polls, sessions
Running migrations:
  Applying polls.0001_initial... OK

You can check out the code from https://github.com/zhuby1973/python/tree/master/mysite with "git checkout part2".

A few examples from matplotlib

# sphinx_gallery_thumbnail_number = 2
import numpy as np
import matplotlib.pyplot as plt
N = 11
x = np.linspace(0, 10, N)
y = [3.9, 4.4, 10.8, 10.3, 11.2, 13.1, 14.1,  9.9, 13.9, 15.1, 12.5]

# Fit a linear curve and estimate its y-values and their error.
a, b = np.polyfit(x, y, deg=1)
y_est = a * x + b
y_err = x.std() * np.sqrt(1/len(x) +
                          (x - x.mean())**2 / np.sum((x - x.mean())**2))

fig, ax = plt.subplots()
ax.plot(x, y_est, '-')
ax.fill_between(x, y_est - y_err, y_est + y_err, alpha=0.2)
ax.plot(x, y, 'o', color='tab:brown')
plt.savefig('confidence.jpg')
plt.show()
"""
============
MRI With EEG
============

Displays a set of subplots with an MRI image, its intensity
histogram and some EEG traces.
"""

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import matplotlib.cm as cm

from matplotlib.collections import LineCollection
from matplotlib.ticker import MultipleLocator

fig = plt.figure("MRI_with_EEG")

# Load the MRI data (256x256 16 bit integers)
with cbook.get_sample_data('s1045.ima.gz') as dfile:
    im = np.frombuffer(dfile.read(), np.uint16).reshape((256, 256))

# Plot the MRI image
ax0 = fig.add_subplot(2, 2, 1)
ax0.imshow(im, cmap=cm.gray)
ax0.axis('off')

# Plot the histogram of MRI intensity
ax1 = fig.add_subplot(2, 2, 2)
im = np.ravel(im)
im = im[np.nonzero(im)]  # Ignore the background
im = im / (2**16 - 1)  # Normalize
ax1.hist(im, bins=100)
ax1.xaxis.set_major_locator(MultipleLocator(0.4))
ax1.minorticks_on()
ax1.set_yticks([])
ax1.set_xlabel('Intensity (a.u.)')
ax1.set_ylabel('MRI density')

# Load the EEG data
n_samples, n_rows = 800, 4
with cbook.get_sample_data('eeg.dat') as eegfile:
    data = np.fromfile(eegfile, dtype=float).reshape((n_samples, n_rows))
t = 10 * np.arange(n_samples) / n_samples

# Plot the EEG
ticklocs = []
ax2 = fig.add_subplot(2, 1, 2)
ax2.set_xlim(0, 10)
ax2.set_xticks(np.arange(10))
dmin = data.min()
dmax = data.max()
dr = (dmax - dmin) * 0.7  # Crowd them a bit.
y0 = dmin
y1 = (n_rows - 1) * dr + dmax
ax2.set_ylim(y0, y1)

segs = []
for i in range(n_rows):
    segs.append(np.column_stack((t, data[:, i])))
    ticklocs.append(i * dr)

offsets = np.zeros((n_rows, 2), dtype=float)
offsets[:, 1] = ticklocs

lines = LineCollection(segs, offsets=offsets, transOffset=None)
ax2.add_collection(lines)

# Set the yticks to use axes coordinates on the y axis
ax2.set_yticks(ticklocs)
ax2.set_yticklabels(['PG3', 'PG5', 'PG7', 'PG9'])

ax2.set_xlabel('Time (s)')


plt.tight_layout()
plt.show()
"""
===============
Watermark image
===============

Using a PNG file as a watermark.
"""

import numpy as np
import matplotlib.cbook as cbook
import matplotlib.image as image
import matplotlib.pyplot as plt


with cbook.get_sample_data('logo2.png') as file:
    im = image.imread(file)

fig, ax = plt.subplots()

ax.plot(np.sin(10 * np.linspace(0, 1)), '-o', ms=20, alpha=0.7, mfc='orange')
ax.grid()
fig.figimage(im, 10, 10, zorder=3, alpha=.5)
plt.savefig('watermark.jpg')
plt.show()

# References: this example uses matplotlib.image.imread, matplotlib.pyplot.imread,
# and matplotlib.figure.Figure.figimage.
"""
======================
Whats New 0.99 Mplot3d
======================

Create a 3D surface plot.
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

X = np.arange(-5, 5, 0.25)
Y = np.arange(-5, 5, 0.25)
X, Y = np.meshgrid(X, Y)
R = np.sqrt(X**2 + Y**2)
Z = np.sin(R)

fig = plt.figure()
ax = Axes3D(fig)
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.viridis)
plt.savefig('Axes3D.jpg')
plt.show()

# References: this example uses mpl_toolkits.mplot3d.Axes3D and its plot_surface method.

Data visualization for US firearm background checks

We can download nics-firearm-background-checks.csv from https://github.com/BuzzFeedNews/nics-firearm-background-checks/tree/master/data
and put it in the same folder as firearm_background.py:

#!/usr/bin/env python
import sys, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
sb.set()

checks = pd.read_csv('nics-firearm-background-checks.csv')

checks["year_int"] = checks["month"].apply(lambda x: int(x.split("-")[0]))
checks["month_int"] = checks["month"].apply(lambda x: int(x.split("-")[1]))

latest_month_count = checks.iloc[0]["month_int"] + (checks.iloc[0]["year_int"] * 12)

totals = checks[
    (checks["month_int"] + (checks["year_int"] * 12)) > (latest_month_count - 12*3)
].groupby("month")["totals"].sum()
tick_placement = np.arange(len(totals) - 1, 0, -3)

ax = totals.plot(kind="area", figsize=(12, 8), color="#000000", alpha=0.5)
ax.figure.set_facecolor("#FFFFFF")
ax.set_title("NICS Background Check Totals — Past 36 Months", fontsize=24)
ax.set_yticklabels([ "{0:,.0f}".format(y) for y in ax.get_yticks() ], fontsize=12)
ax.set_xticks(tick_placement)
ax.set_xticklabels([ totals.index[i] for i in tick_placement ])
ax.set_xlim(0, len(totals) - 1)
plt.setp(ax.get_xticklabels(), rotation=0, fontsize=12)
ax.set_xlabel("")

plt.savefig('image.jpg')
plt.show()

Run firearm_background.py and image.jpg will be saved and displayed (savefig defaults to PNG; you need to pip install pillow to save in JPG format).

Install Anaconda and Jupyter Notebook on Ubuntu 20.04

1. download and install latest Anaconda
curl -O https://repo.anaconda.com/archive/Anaconda3-2020.02-Linux-x86_64.sh
bash Anaconda3-2020.02-Linux-x86_64.sh
2. Activate and test Installation
source ~/.bashrc
conda list
3. install nodejs
(base) ubuntu@ubunu2004:~$ conda install -c conda-forge nodejs
4. Set Up Anaconda Environments
conda create --name my_env python=3
conda activate my_env
5. install jupyter and matplotlib
pip install jupyter
pip install matplotlib
6. start jupyter notebook
(my_env) ubuntu@ubunu2004:~$ jupyter notebook
[I 15:45:31.229 NotebookApp] Writing notebook server cookie secret to /home/ubuntu/.local/share/jupyter/runtime/notebook_cookie_secret
[I 15:45:33.363 NotebookApp] Serving notebooks from local directory: /home/ubuntu
[I 15:45:33.365 NotebookApp] The Jupyter Notebook is running at:
[I 15:45:33.366 NotebookApp] http://localhost:8888/?token=d477ec6ca56d2382cc3f87392a06a8a6000a3aba2d367fc8
[I 15:45:33.372 NotebookApp] or http://127.0.0.1:8888/?token=d477ec6ca56d2382cc3f87392a06a8a6000a3aba2d367fc8
[I 15:45:33.373 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
7. create new sample.py code in jupyter notebook:
open http://localhost:8888/?token=d477ec6ca56d2382cc3f87392a06a8a6000a3aba2d367fc8 and create sample.py code:

import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt

objects = ('Python', 'C++', 'Java', 'Perl', 'Scala', 'Lisp')
y_pos = np.arange(len(objects))
performance = [10,8,6,4,2,1]

plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Usage')
plt.title('Programming language usage')

plt.show()

Output:

  8. Secure and bind Jupyter on all IPs
    (my_env) ubuntu@ubunu2004:~$ jupyter notebook --generate-config
    Writing default config to: /home/ubuntu/.jupyter/jupyter_notebook_config.py
    (my_env) ubuntu@ubunu2004:~$ jupyter notebook password
    Enter password:
    Verify password:
    [NotebookPasswordApp] Wrote hashed password to /home/ubuntu/.jupyter/jupyter_notebook_config.json
    {
    "NotebookApp": {
    "password": "sha1:de4ae15e9dcd:d32f371026e38a9241ec83a469de166b97ea60b7"
    }
    }

(my_env) ubuntu@ubunu2004:~$ openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout mykey.key -out mycert.pem
Generating a RSA private key ….

then you can vi /home/ubuntu/.jupyter/jupyter_notebook_config.py:

# Set options for certfile, ip, password, and toggle off
# browser auto-opening
c.NotebookApp.certfile = u'/home/ubuntu/mycert.pem'
c.NotebookApp.keyfile = u'/home/ubuntu/mykey.key'
# Set ip to '*' to bind on all interfaces (ips) for the public server
c.NotebookApp.ip = '*'
c.NotebookApp.password = u'sha1:de4ae15e9dcd:d32f371026e38a9241ec83a469de166b97ea60b7'
c.NotebookApp.open_browser = False

# It is a good idea to set a known, fixed port for server access
c.NotebookApp.port = 9999

start it with new port and binding:
(my_env) ubuntu@ubunu2004:~$ jupyter notebook
[I 16:48:20.227 NotebookApp] Serving notebooks from local directory: /home/ubuntu
[I 16:48:20.230 NotebookApp] The Jupyter Notebook is running at:
[I 16:48:20.230 NotebookApp] https://ubunu2004:9999/
Now you can open jupyter notebook from remote browser with your password!

Free Data Sets for Data Science Projects

18 Places to Find Free Data Sets for Data Science Projects

If you’ve ever worked on a personal data science project, you’ve probably spent a lot of time browsing the internet looking for interesting data sets to analyze. It can be fun to sift through dozens of data sets to find the perfect one. But it can also be frustrating to download and import several csv files, only to realize that the data isn’t that interesting after all. Luckily, there are online repositories that curate data sets and (mostly) remove the uninteresting ones.

In this post, we’ll walk through several types of data science projects, including data visualization projects, data cleaning projects, and machine learning projects, and identify good places to find data sets for each. Whether you want to strengthen your data science portfolio by showing that you can visualize data well, or you have a spare few hours and want to practice your machine learning skills, we’ve got you covered.

Data Sets for Data Visualization Projects

A typical data visualization project might be something along the lines of “I want to make an infographic about how income varies across the different states in the US”. There are a few considerations to keep in mind when looking for good data for a data visualization project:

It shouldn’t be messy, because you don’t want to spend a lot of time cleaning data.

It should be nuanced and interesting enough to make charts about.

Ideally, each column should be well-explained, so the visualization is accurate.

The data set shouldn’t have too many rows or columns, so it’s easy to work with.

News sites that release their data publicly can be great places to find data sets for data visualization. They typically clean the data for you, and they often already have charts they’ve made that you can learn from, replicate, or improve.

  1. FiveThirtyEight

If you’re interested in data at all, you’ve almost certainly heard of FiveThirtyEight; it’s one of the best-established data journalism outlets in the world. They write interesting data-driven articles, like “Don’t blame a skills gap for lack of hiring in manufacturing” and “2016 NFL Predictions”.

What you may not know is that FiveThirtyEight also makes the data sets used in its articles available online on Github and on its own data portal.

Here are some examples:

Airline Safety — contains information on accidents from each airline.

US Weather History — historical weather data for the US.

Study Drugs — data on who’s taking Adderall in the US.

  2. BuzzFeed

BuzzFeed may have started as a purveyor of low-quality clickbait, but these days it also does high-quality data journalism. And, much like FiveThirtyEight, it publishes some of its datasets publicly on its Github page.

Here are some examples:

Federal Surveillance Planes — contains data on planes used for domestic surveillance.

Zika Virus — data about the geography of the Zika virus outbreak.

Firearm background checks — data on background checks of people attempting to buy firearms.

  3. ProPublica

ProPublica is a nonprofit investigative reporting outlet that publishes data journalism focused on issues of public interest, primarily in the US. They maintain a data store that hosts quite a few free data sets in addition to some paid ones (scroll down on that page to get past the paid ones). Many of them are actively maintained and frequently updated. ProPublica also offers five data-related APIs, four of which are accessible for free.

Here are some examples:

Political advertisements on Facebook — a free collection of data about Facebook ads that is updated daily.

Hate crime news — regularly-updated data about hate crimes reported in Google News.

Voting machine age — data on the age of voting machines that were used in the 2016 election.

Data sets for Data Processing Projects

Sometimes you just want to work with a large data set. The end result doesn’t matter as much as the process of reading in and analyzing the data. You might use tools like Spark (which you can learn in our Spark course) or Hadoop to distribute the processing across multiple nodes. Things to keep in mind when looking for a good data processing data set:

The cleaner the data, the better — cleaning a large data set can be very time consuming.

There should be an interesting question that can be answered with the data.

Cloud hosting providers like Amazon and Google are good places to find big data sets. They have an incentive to host data, because they can make you analyze that data using their infrastructure (and thus pay them).

  4. AWS Public Data sets

Amazon makes large data sets available on its Amazon Web Services platform. You can download the data and work with it on your own computer, or analyze the data in the cloud using EC2 and Hadoop via EMR.

You can read more about how the program works here, and check out the data sets for yourself here (although you’ll need a free AWS account first).

Here are some examples:

Lists of n-grams from Google Books — common words and groups of words from a huge set of books.

Common Crawl Corpus — data from a crawl of over 5 billion web pages.

Landsat images — moderate resolution satellite images of the surface of the Earth.

  5. Google Public Data sets

Google also has a cloud hosting service, which is called Google Cloud. With Google Cloud, you can use a tool called BigQuery to explore large data sets.

Google lists all of the data sets on this page. You’ll need to sign up for a Google Cloud account to see it, but the first 1TB of queries you make each month are free, so as long as you’re careful, you won’t have to pay anything.

Here are some examples:

USA Names — contains all Social Security name applications in the US, from 1879 to 2015.

Github Activity — contains all public activity on over 2.8 million public Github repositories.

Historical Weather — data from 9000 NOAA weather stations from 1929 to 2016.

  6. Wikipedia

Wikipedia is a free, online, community-edited encyclopedia. It contains an astonishing breadth of knowledge, containing pages on everything from the Ottoman-Habsburg Wars to Leonard Nimoy. As part of Wikipedia’s commitment to advancing knowledge, they offer all of their content for free, and regularly generate dumps of all the articles on the site. Additionally, Wikipedia offers edit history and activity data, so you can track how a page on a topic evolves over time, and who contributes to it.

Methods and a how-to guide for downloading the data are available here.

Here are some examples:

All images and other media from Wikipedia — all the images and other media files on Wikipedia.

Full site dumps — of the content on Wikipedia, in various formats.

Data Sets for Machine Learning Projects

When you’re working on a machine learning project, you want to be able to predict a column using information from the other columns of a data set. In order to be able to do this, we need to make sure that:

The data set isn’t too messy — if it is, we’ll spend all of our time cleaning the data.

There’s an interesting target column to make predictions for.

The other variables have some explanatory power for the target column.

There are a few online repositories of data sets curated specifically for machine learning. These data sets are typically cleaned up beforehand, and allow for testing algorithms very quickly.

  7. Kaggle

Kaggle is a data science community that hosts machine learning competitions. There are a variety of externally-contributed interesting data sets on the site. Kaggle has both live and historical competitions. You can download data for either, but you have to sign up for Kaggle and accept the terms of service for the competition.

You can download data from Kaggle by entering a competition. Each competition has its own associated data set. There are also user-contributed data sets available here, though these may be less well cleaned and curated than the data sets used for competitions.

Here are some examples:

Satellite Photograph Order — a set of satellite photos of Earth — the goal is to predict which photos were taken earlier than others.

Manufacturing Process Failures — a collection of variables that were measured during the manufacturing process. The goal is to predict faults with the manufacturing.

Multiple Choice Questions — a data set of multiple choice questions and the corresponding correct answers. The goal is to predict the answer for any given question.

  8. UCI Machine Learning Repository

The UCI Machine Learning Repository is one of the oldest sources of data sets on the web. Although the data sets are user-contributed, and thus have varying levels of documentation and cleanliness, the vast majority are clean and ready for machine learning to be applied. UCI is a great first stop when looking for interesting data sets.

You can download data directly from the UCI Machine Learning repository, without registration. These data sets tend to be fairly small, and don’t have a lot of nuance, but they’re great for machine learning.

Here are some examples:

Email spam — contains emails, along with a label of whether or not they’re spam.

Wine classification — contains various attributes of 178 different wines.

Solar flares — attributes of solar flares, useful for predicting characteristics of flares.

  9. Quandl

Quandl is a repository of economic and financial data. Some of this information is free, but many data sets require purchase. Quandl is useful for building models to predict economic indicators or stock prices. Due to the large amount of available data, it’s possible to build a complex model that uses many data sets to predict values in another.

Here are some examples:

Entrepreneurial activity by race and other factors — contains data from the Kauffman foundation on entrepreneurs in the US.

Chinese macroeconomic data — indicators of Chinese economic health.

US Federal Reserve data — US economic indicators, from the Federal Reserve.

Data Sets for Data Cleaning Projects

Sometimes, it can be very satisfying to take a data set spread across multiple files, clean it up, condense it all into a single file, and then do some analysis. In data cleaning projects, it can take hours of research to figure out what each column in the data set means. It may turn out that the data set you’re analyzing isn’t really suitable for what you’re trying to do, and you’ll need to start over.

That can be frustrating, but it’s a common part of every data science job, and it requires practice.

When looking for a good data set for a data cleaning project, you want it to:

Be spread over multiple files.

Have a lot of nuance, and many possible angles to take.

Require a good amount of research to understand.

Be as “real-world” as possible.

These types of data sets are typically found on websites that collect and aggregate data sets. These aggregators tend to have data sets from multiple sources, without much curation. In this case, that’s a good thing — too much curation gives us overly neat data sets that are hard to do extensive cleaning on.

  10. data.world

Data.world is a user-driven data collection site (among other things) where you can search for, copy, analyze, and download data sets. You can also upload your own data to data.world and use it to collaborate with others.

The site includes some key tools that make working with data from the browser easier. You can write SQL queries within the site interface to explore data and join multiple data sets. They also have SDKs for R and Python that make it easier to acquire and work with data in your tool of choice (and you might be interested in reading our tutorial on using the data.world Python SDK.)

All of the data is accessible from the main site, but you’ll need to create an account, log in, and then search for the data you’d like.

Here are some examples:

Climate Change Data — a large set of climate change data from the World Bank.

European soccer data — data on soccer/football in 11 European countries from 2008-2016.

Big Cities Health — health data for major cities in the US.

  11. Data.gov

Data.gov is an aggregator of public data sets from a variety of US government agencies, part of a broader push towards more open government. Data can range from government budgets to school performance scores. Much of the data requires additional research, and it can sometimes be hard to figure out which data set is the “correct” version. Anyone can download the data, although some data sets will ask you to jump through additional hoops, like agreeing to licensing agreements before downloading.

You can browse the data sets on Data.gov directly, without registering. You can browse by topic area, or search for a specific data set.

Here are some examples:

Food Environment Atlas — contains data on how local food choices affect diet in the US.

School System Finances — a survey of the finances of school systems in the US.

  12. The World Bank

The World Bank is a global development organization that offers loans and advice to developing countries. The World Bank regularly funds programs in developing countries, then gathers data to monitor the success of these programs.

You can browse world bank data sets directly, without registering. The data sets have many missing values (which is great for cleaning practice), and it sometimes takes several clicks to actually get to data.

Here are some examples:

World Development Indicators — contains country level information on development.

Educational Statistics — data on education by country.

World Bank project Costs — data on World Bank projects and their corresponding costs.

  13. /r/datasets

Reddit, a popular community discussion site, has a section devoted to sharing interesting data sets. It’s called the datasets subreddit, or /r/datasets. The scope and quality of these data sets varies a lot, since they’re all user-submitted, but they are often very interesting and nuanced.

You can browse the subreddit here without an account (although a free account will be required to comment or submit data sets yourself). You can also see the most highly-upvoted data sets of all time here.

Here are some examples:

All Reddit Submissions — contains reddit submissions through 2015.

Jeopardy Questions — questions and point values from the gameshow Jeopardy.

New York City Property Tax Data — data about properties and assessed value in New York City.

  14. Academic Torrents

Academic Torrents is a data aggregator geared toward sharing the data sets from scientific papers. It has all sorts of interesting (and often massive) data sets, although it can sometimes be difficult to get context on a particular data set without reading the original paper and/or having some expertise in the relevant domains of science.

You can browse the data sets directly on the site. Since it’s a torrent site, all of the data sets can be immediately downloaded, but you’ll need a Bittorrent client. Deluge is a good free option that’s available for Windows, Mac, and Linux.

Here are some examples:

Enron Emails — a set of many emails from executives at Enron, a company that famously went bankrupt.

Student Learning Factors — a set of factors that measure and influence student learning.

News Articles — contains news article attributes and a target variable.

Bonus: Streaming data

When you’re building a data science project, it’s very common to download a data set and then process it.

However, as online services generate more and more data, an increasing amount is available in real-time, and not available in downloadable data set form. Some examples of this include data on tweets from Twitter and stock price data. There aren’t many good sources to acquire this kind of data in downloadable form, and a downloadable file would be quickly out of date anyway. Instead, this data is often available in real time as streaming data, via an API.

Here are a few good streaming data sources in case you want to try your hand at a streaming data project.

  15. Twitter

Twitter has a good streaming API, and makes it relatively straightforward to filter and stream tweets. You can get started here. There are tons of options here — you could figure out what states are the happiest, or which countries use the most complex language. If you’d like some help getting started working with this Twitter API, check out our tutorial here.

  16. Github

GitHub has an API that allows you to access repository activity and code. You can get started with the API here. The options are endless — you could build a system to automatically score code quality, or figure out how code evolves over time in large projects.

  17. Quantopian

Quantopian is a site where you can develop, test, and optimize stock trading algorithms. In order to help you do that, the site gives you access to free minute-by-minute stock price data, which you can use to build a stock price prediction algorithm.

  18. Wunderground

Wunderground has an API for weather forecasts that is free up to 500 API calls per day. You could use these calls to build up a set of historical weather data, and then use that to make predictions about the weather tomorrow.

An A-Z of useful Python tricks

Python is one of the world’s most popular, in-demand programming languages. This is for many reasons:

  • it’s easy to learn
  • it’s super versatile
  • it has a huge range of modules and libraries

I use Python daily as an integral part of my job as a data scientist. Along the way, I’ve picked up a few useful tricks and tips.

Here, I’ve shared some of them in an A-Z format.

Most of these ‘tricks’ are things I’ve used or stumbled upon during my day-to-day work. Some I found while browsing the Python Standard Library docs. A few others I found searching through PyPi.

However, credit where it is due — I discovered four or five of them over at awesome-python.com. This is a curated list of hundreds of interesting Python tools and modules. It is worth browsing for inspiration!

all or any

One of the many reasons why Python is such a popular language is because it is readable and expressive.

It is often joked that Python is ‘executable pseudocode’. But when you can write code like this, it’s difficult to argue otherwise:

x = [True, True, False]
if any(x):
    print("At least one True")
if all(x):
    print("Not one False")
if any(x) and not all(x):    
    print("At least one True and one False")

bashplotlib

You want to plot graphs in the console?

$ pip install bashplotlib

You can have graphs in the console.

collections

Python has some great default datatypes, but sometimes they just won’t behave exactly how you’d like them to.

Luckily, the Python Standard Library offers the collections module. This handy add-on provides you with further datatypes.

from collections import OrderedDict, Counter

# Remembers the order the keys are added!
x = OrderedDict(a=1, b=2, c=3)

# Counts the frequency of each character
y = Counter("Hello World!")
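Continuing the example above, Counter exposes the tallies it has built:

print(y['l'])              # 3
print(y.most_common(2))    # [('l', 3), ('o', 2)]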

dir

Ever wondered how you can look inside a Python object and see what attributes it has? Of course you have.

From the command line:

>>> dir()
>>> dir("Hello World")
>>> dir(dir)

This can be a really useful feature when running Python interactively, and for dynamically exploring objects and modules you are working with.

Read more here.

emoji

Yes, really.

$ pip install emoji

Don’t pretend you’re not gonna try it out…

from emoji import emojize
print(emojize(":thumbs_up:"))

from __future__ import

One consequence of Python’s popularity is that there are always new versions under development. New versions mean new features — unless your version is out-of-date.

Fear not, however. The __future__ module lets you import functionality from future versions of Python. It’s literally like time travel, or magic, or something.

from __future__ import print_function
print("Hello World!")

Why not have a go importing curly braces?
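(Spoiler: braces are an Easter egg, and trying to import them just raises an exception.)

from __future__ import braces
# SyntaxError: not a chance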

geopy

Geography can be a challenging terrain for programmers to navigate (ha, a pun!). But the geopy module makes it unnervingly easy.

$ pip install geopy

It works by abstracting the APIs of a range of different geocoding services. It enables you to obtain a place’s full street address, latitude, longitude, and even altitude.

There’s also a useful distance class. It calculates the distance between two locations in your favorite unit of measurement.

from geopy import GoogleV3

place = "221b Baker Street, London"
location = GoogleV3().geocode(place)
print(location.address)
print(location.latitude, location.longitude)
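And a small sketch of the distance helper mentioned above; the coordinate pairs are arbitrary examples, and geodesic is the distance class in recent geopy releases:

from geopy.distance import geodesic

newport_ri = (41.49008, -71.312796)
cleveland_oh = (41.499498, -81.695391)
print(geodesic(newport_ri, cleveland_oh).miles)   # roughly 538 miles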

howdoi

Stuck on a coding problem and can’t remember that solution you saw before? Need to check StackOverflow, but don’t want to leave the terminal?

Then you need this useful command line tool.

$ pip install howdoi

Ask it whatever question you have, and it’ll do its best to return an answer.

$ howdoi vertical align css
$ howdoi for loop in java
$ howdoi undo commits in git

Be aware though — it scrapes code from top answers from StackOverflow. It might not always give the most helpful information…

$ howdoi exit vim

inspect

Python’s inspect module is great for understanding what is happening behind the scenes. You can even call its methods on itself!

The code sample below uses inspect.getsource() to print its own source code. It also uses inspect.getmodule() to print the module in which it was defined.

The last line of code prints out its own line number.

import inspect

print(inspect.getsource(inspect.getsource))
print(inspect.getmodule(inspect.getmodule))
print(inspect.currentframe().f_lineno)

Of course, beyond these trivial uses, the inspect module can prove useful for understanding what your code is doing. You could also use it for writing self-documenting code.

Jedi

The Jedi library is an autocompletion and code analysis library. It makes writing code quicker and more productive.

Unless you’re developing your own IDE, you’ll probably be most interested in using Jedi as an editor plugin. Luckily, there are already loads available!

You may already be using Jedi, however. The IPython project makes use of Jedi for its code autocompletion functionality.

**kwargs

When learning any language, there are many milestones along the way. With Python, understanding the mysterious **kwargs syntax probably counts as one.

The double-asterisk in front of a dictionary object lets you pass the contents of that dictionary as named arguments to a function.

The dictionary’s keys are the argument names, and the values are the values passed to the function. You don’t even need to call it kwargs!

dictionary = {"a": 1, "b": 2}

def someFunction(a, b):
    print(a + b)
    return
    
# these do the same thing:
someFunction(**dictionary)
someFunction(a=1, b=2)

This is useful when you want to write functions that can handle named arguments not defined in advance.

List comprehensions

One of my favourite things about programming in Python is its list comprehensions.

These expressions make it easy to write very clean code that reads almost like natural language.

You can read more about how to use them here.

numbers = [1,2,3,4,5,6,7]
evens = [x for x in numbers if x % 2 == 0]
odds = [y for y in numbers if y not in evens]

cities = ['London', 'Dublin', 'Oslo']

def visit(city):
    print("Welcome to "+city)
for city in cities:
    visit(city)

map

Python supports functional programming through a number of inbuilt features. One of the most useful is the map() function — especially in combination with lambda functions.

x = [1, 2, 3]
y = map(lambda x : x + 1 , x)
print(list(y))   # prints out [2, 3, 4]

In the example above, map() applies a simple lambda function to each element in x. It returns a map object, which you can convert to a list or tuple if you need all the results at once.

newspaper3k

If you haven’t seen it already, then be prepared to have your mind blown by Python’s newspaper module.

It lets you retrieve news articles and associated meta-data from a range of leading international publications. You can retrieve images, text and author names.

It even has some inbuilt NLP functionality.

So if you were thinking of using BeautifulSoup or some other DIY webscraping library for your next project, save yourself the time and effort and $ pip install newspaper3k instead.
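A minimal sketch of the typical Article workflow (the URL is a placeholder, and the nlp() step needs the NLTK punkt data downloaded):

from newspaper import Article

article = Article('https://example.com/some-news-story')
article.download()
article.parse()
print(article.title)
print(article.authors)
article.nlp()               # optional built-in NLP step
print(article.keywords)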

Operator overloading

Python provides support for operator overloading, which is one of those terms that make you sound like a legit computer scientist.

It’s actually a simple concept. Ever wondered why Python lets you use the + operator to add numbers and also to concatenate strings? That’s operator overloading in action.

You can define objects which use Python’s standard operator symbols in their own specific way. This lets you use them in contexts relevant to the objects you’re working with.

class Thing:
    def __init__(self, value):
        self.__value = value
    def __gt__(self, other):
        return self.__value > other.__value
    def __lt__(self, other):
        return self.__value < other.__value

something = Thing(100)
nothing = Thing(0)

# True
something > nothing

# False
something < nothing

# Error
something + nothing

pprint

Python’s default print function does its job. But try printing out any large, nested object, and the result is rather ugly.

Here’s where the Standard Library’s pretty-print module steps in. This prints out complex structured objects in an easy-to-read format.

A must-have for any Python developer who works with non-trivial data structures.

import requests
import pprint

url = 'https://randomuser.me/api/?results=1'
users = requests.get(url).json()
pprint.pprint(users)

Queue

Python supports multithreading, and this is facilitated by the Standard Library’s queue module (named Queue in Python 2).

This module lets you implement queue data structures. These are data structures that let you add and retrieve entries according to a specific rule.

‘First in, first out’ (or FIFO) queues let you retrieve objects in the order they were added. ‘Last in, first out’ (LIFO) queues let you access the most recently added objects first.

Finally, priority queues let you retrieve objects according to the order in which they are sorted.

Here’s an example of how to use queues for multithreaded programming in Python.
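A minimal FIFO sketch with a couple of worker threads; the item payloads and thread count are made up for illustration:

import queue
import threading

q = queue.Queue()

def worker():
    while True:
        item = q.get()
        if item is None:        # sentinel: tell the worker to stop
            break
        print("processing", item)
        q.task_done()

threads = [threading.Thread(target=worker) for _ in range(2)]
for t in threads:
    t.start()

for item in range(5):
    q.put(item)

q.join()                        # block until every item is marked done
for _ in threads:
    q.put(None)                 # shut the workers down
for t in threads:
    t.join()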

__repr__

When defining a class or an object in Python, it is useful to provide an ‘official’ way of representing that object as a string. For example:

>>> file = open('file.txt', 'r')
>>> print(file)
<open file 'file.txt', mode 'r' at 0x10d30aaf0>

This makes debugging code a lot easier. Add it to your class definitions as below:

class someClass:
    def __repr__(self):
        return "<some description here>"
        
someInstance = someClass()

# prints <some description here>
print(someInstance)

sh

Python makes a great scripting language. Sometimes using the standard os and subprocess libraries can be a bit of a headache.

The sh library provides a neat alternative.

It lets you call any program as if it were an ordinary function — useful for automating workflows and tasks, all from within Python.

import sh
sh.pwd()
sh.mkdir('new_folder')
sh.touch('new_file.txt')
sh.whoami()
sh.echo('This is great!')

Type hints

Python is a dynamically-typed language. You don’t need to specify datatypes when you define variables, functions, classes etc.

This allows for rapid development times. However, there are few things more annoying than a runtime error caused by a simple typing issue.

Since Python 3.5, you have the option to provide type hints when defining functions.

def addTwo(x: int) -> int:
    return x + 2

You can also define type aliases:

from typing import List
Vector = List[float]
Matrix = List[Vector]
def addMatrix(a : Matrix, b : Matrix) -> Matrix:
    result = []
    for i,row in enumerate(a):
        result_row =[]
        for j, col in enumerate(row):
            result_row += [a[i][j] + b[i][j]]
        result += [result_row]
    return result

x = [[1.0, 0.0], [0.0, 1.0]]
y = [[2.0, 1.0], [0.0, -2.0]]
z = addMatrix(x, y)

Although not compulsory, type annotations can make your code easier to understand.

They also allow you to use type checking tools to catch those stray TypeErrors before runtime. Probably worthwhile if you are working on large, complex projects!
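mypy is the usual example of such a tool; a typical invocation (the file name is hypothetical) looks like:

pip install mypy
mypy my_script.py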

uuid

A quick and easy way to generate Universally Unique IDs (or ‘UUIDs’) is through the Python Standard Library’s uuid module.

import uuid

user_id = uuid.uuid4()
print(user_id)

This creates a randomized 128-bit number that will almost certainly be unique.

In fact, there are over 2¹²² possible UUIDs that can be generated. That’s over five undecillion (or 5,000,000,000,000,000,000,000,000,000,000,000,000).

The probability of finding duplicates in a given set is extremely low. Even with a trillion UUIDs, the probability of a duplicate existing is much, much less than one-in-a-billion.

Pretty good for two lines of code.

Virtual environments

This is probably my favorite Python thing of all.

Chances are you are working on multiple Python projects at any one time. Unfortunately, sometimes two projects will rely on different versions of the same dependency. Which do you install on your system?

Luckily, Python’s support for virtual environments lets you have the best of both worlds. From the command line:

python -m venv my-project
source my-project/bin/activate
pip install all-the-modules 

Now you can have standalone versions and installations of Python running on the same machine. Sorted!

wikipedia

Wikipedia has a great API that allows users programmatic access to an unrivalled body of completely free knowledge and information.

The wikipedia module makes accessing this API almost embarrassingly convenient.

import wikipedia

result = wikipedia.page('freeCodeCamp')
print(result.summary)

for link in result.links:
    print(link)

Like the real site, the module provides support for multiple languages, page disambiguation, random page retrieval, and even has a donate() method.

xkcd

Humour is a key feature of the Python language — after all, it is named after the British comedy sketch show Monty Python’s Flying Circus. Much of Python’s official documentation references the show’s most famous sketches.

The sense of humour isn’t restricted to the docs, though. Have a go running the line below:

import antigravity

Never change, Python. Never change.

YAML

YAML stands for ‘YAML Ain’t Markup Language’. It is a data formatting language, and is a superset of JSON.

Unlike JSON, it can store more complex objects and refer to its own elements. You can also write comments, making it particularly suited to writing configuration files.

The PyYAML module lets you use YAML with Python. Install with:

$ pip install pyyaml

And then import into your projects:

import yaml

PyYAML lets you store Python objects of any datatype, and instances of any user-defined classes also.
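A small round-trip sketch using the safe_* functions (the file name and the dictionary contents are arbitrary):

import yaml

config = {'name': 'demo', 'ports': [8080, 8443], 'debug': True}

with open('config.yaml', 'w') as f:
    yaml.safe_dump(config, f)

with open('config.yaml') as f:
    loaded = yaml.safe_load(f)

print(loaded['ports'])    # [8080, 8443]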

zip

One last trick for ya, and it really is a cool one. Ever needed to form a dictionary out of two lists?

keys = ['a', 'b', 'c']
vals = [1, 2, 3]
zipped = dict(zip(keys, vals))

The zip() built-in function takes a number of iterable objects and returns an iterator of tuples, which you can pass to list() or dict(). Each tuple groups the elements of the input objects by their positional index.

You can also ‘unzip’ a zipped object by passing it back to zip() with the * operator.
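For instance, turning a list of pairs back into two tuples:

pairs = [('a', 1), ('b', 2), ('c', 3)]
letters, numbers = zip(*pairs)
print(letters)   # ('a', 'b', 'c')
print(numbers)   # (1, 2, 3)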

Thanks for reading!

How to Install PHP7.4 and phpMyAdmin on Ubuntu 20.04

Step 1 – Install Apache2 and PHP 7.4
sudo apt install apache2 wget unzip
sudo apt install software-properties-common
sudo add-apt-repository ppa:ondrej/php
sudo apt update
sudo apt install -y php7.4
sudo apt install php7.4-mysql php7.4-curl php7.4-json php7.4-cgi php7.4-xsl
sudo apt install php7.4-zip php7.4-mbstring
sudo systemctl enable apache2
sudo systemctl start apache2
Step 2 – Install phpMyAdmin on Ubuntu 20.04
wget https://files.phpmyadmin.net/phpMyAdmin/5.0.2/phpMyAdmin-5.0.2-all-languages.zip
unzip phpMyAdmin-5.0.2-all-languages.zip
sudo mkdir /usr/share/phpmyadmin
sudo mv phpMyAdmin-5.0.2-all-languages/* /usr/share/phpmyadmin
sudo mkdir /usr/share/phpmyadmin/tmp
sudo chown -R www-data:www-data /usr/share/phpmyadmin
sudo chmod 777 /usr/share/phpmyadmin/tmp
Step 3 – Configure phpMyAdmin
sudo vi /etc/apache2/conf-available/phpmyadmin.conf:

Alias /phpmyadmin /usr/share/phpmyadmin
Alias /phpMyAdmin /usr/share/phpmyadmin

<Directory /usr/share/phpmyadmin/>
   AddDefaultCharset UTF-8
   <IfModule mod_authz_core.c>
      <RequireAny>
      Require all granted
     </RequireAny>
   </IfModule>
</Directory>

<Directory /usr/share/phpmyadmin/setup/>
   <IfModule mod_authz_core.c>
     <RequireAny>
       Require all granted
     </RequireAny>
   </IfModule>
</Directory>

sudo a2enconf phpmyadmin
This creates a symlink in /etc/apache2/conf-enabled.
sudo systemctl restart apache2
Step 4 – Adjusting FirewallD
sudo firewall-cmd --permanent --add-service=http
sudo firewall-cmd --reload
Step 5 – Access phpMyAdmin
http://your-server-ip-domain/phpmyadmin
Log in with the username and password used to access MySQL on the command line.

XPath locators in Selenium

Inspect the element in Chrome DevTools, then press CTRL+F and try to locate your element with an expression like //input[@name='email'], refining it until you get exactly one result.

Some sample XPath locators:

link
//a[text()='Features']
//a[contains(text(),'Features')]
button
//button[@type='button' and @class='btn']
//button[contains(text(), 'Sign Up')]
//div[@class='dropdown']//button[@type='button' and @class='btn btn dropdown' and @id='dropdownMenu']
//button[@id='dropdownMenu']
checkbox
//a[text()='test2 test2']//parent::td[@class='datalistrow']//preceding-sibling::td[@class='datalistrow']//input[@name='contact_id']
driver.findElement(By.xpath("//a[text()='test2 test2']//parent::td[@class='datalistrow']//preceding-sibling::td[@class='datalistrow']//input[@name='contact_id']")).click()
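The same locator style works from Python's Selenium bindings; a minimal sketch, assuming Selenium 4, chromedriver available on the PATH, and a page that actually contains such a button:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://example.com")
driver.find_element(By.XPATH, "//button[@type='button' and @class='btn']").click()
driver.quit()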

Enable Wily for the specified WebSphere JVM Instance

# --------------------------------------------------------------------------
# Enable Wily for the specified JVM Instance
# --------------------------------------------------------------------------

def _splitlist(s):
    """Given a string of the form [item item item], return a list of
strings, one per item.
    WARNING: does not yet work right when an item has spaces.  I believe in
that case we'll be
    given a string like '[item1 "item2 with spaces" item3]'.
    """
    if s[0] != '[' or s[-1] != ']':
        raise "Invalid string: %s" % s
    #endIf
    return s[1:-1].split(' ')
#endDef _splitlist

def wsadminToList(inStr):
        inStr = inStr.rstrip();
        outList=[]
        if (len(inStr)>0 and inStr[0]=='[' and inStr[-1]==']'):
                tmpList = inStr[1:-1].split(" ")
        else:
                tmpList = inStr.split("\n")   #splits for Windows or Linux
        #endIfElse
        for item in tmpList:
                item = item.rstrip();         #removes any Windows "\r"
                if (len(item)>0):
                      outList.append(item)
                #endIf
        #endFor
        return outList
#endDef wsadminToList

def installCustomService ( server, cs_attribs ):
        serverName = AdminConfig.showAttribute(server, "name" )
        customServices = AdminConfig.getid("/Cell:"+cellName+"/Node:"+nodeName+"/Server:"+serverName+"/CustomService:/" )
        customServices = wsadminToList(customServices)
        # Find displayName attribute
        customServiceDisplayName = ""
        for attrib in cs_attribs:
                name = attrib[0]
                if (cmp(name, "displayName") == 0):
                        customServiceDisplayName = attrib[1]
                #endIf
        #endFor

        if (len(customServiceDisplayName) == 0):
                print "---> ERROR in installCustomService - displayName attribute not set"
                sys.exit()
        #endIf

        found = "false"

        for customService in customServices:
                displayName = AdminConfig.showAttribute(customService, "displayName" )
                if (cmp(displayName, customServiceDisplayName) == 0):
                        found = "true"
                        print "     "+displayName+" CustomService object exists! Modifying ......"
                        AdminConfig.modify(customService, cs_attribs )
                        print "     Done."
                        break
                #endIf
        #endFor

        if (cmp(found, "false") == 0):
                print "     Creating "+customServiceDisplayName+" Custom Service ......"
                AdminConfig.create("CustomService", server, cs_attribs )
                print "     Done."
        #endIf
#endDef


# Uncomment the line below for Debugging this script.
# wsadminlib.enableDebugMessages()
serverName = "";

for arg in sys.argv:
    if (arg.startswith("-appServerName=")):
        serverName = arg[15:]
    #endIf
#endFor

for arg in sys.argv:
    if (arg.startswith("-nodeName=")):
        nodeName = arg[10:]
    #endIf
#endFor

genericArgument = "-XX:-UseSplitVerifier -javaagent:/wily/wily/Introscope95/AgentNoRedef.jar -Dcom.wily.introscope.agentProfile=/wily/wily/Introscope95/core/config/profiles/"  + serverName + "-IntroscopeAgent.profile"
wilyCustomServiceClassName = "com.wily.introscope.api.websphere.IntroscopeCustomService"
wilyCustomServiceClassPath = "/wily/wily/Introscope95/common/WebAppSupport.jar"
wilyCustomServiceDisplayName = "Introscope Custom Services"

cellConfigId = AdminConfig.getid('/Cell:/')
cellName=AdminConfig.showAttribute(cellConfigId, 'name')
nodeConfigId = AdminConfig.getid('/Node:/')
#nodeName = AdminConfig.showAttribute(nodeConfigId, 'name')
#nodeName = 'iedm2c16Node03'

# The command below should be used for AIX server environment
#serverConfigId = AdminConfig.getid('/Server:' + serverName + '/')
serverConfigId = AdminConfig.getid('/Cell:' + cellName + '/Node:' + nodeName + '/Server:' + serverName + '/')
# The command below should be used for WebSphere in local developer environment
#serverConfigId = AdminConfig.getid('/Server:/')
serverName = AdminConfig.showAttribute(serverConfigId, 'name')
print "Server Name " + serverName

# Add Generic JVM arguments
# the pdefs come back as a string [item item item]
jvmId = None
pdefs = _splitlist(AdminConfig.showAttribute(serverConfigId, 'processDefinitions'))
pdef = None
for p in pdefs:
    if -1 != p.find("JavaProcessDef"):
        pdef = p
        break
    #endIf
#endFor

if pdef: # found Java ProcessDef
    jvmId = _splitlist(AdminConfig.showAttribute(pdef, 'jvmEntries'))[0]
#endIf

currentGenericJvmArguments = AdminConfig.showAttribute(jvmId,'genericJvmArguments')
findSplitVerifier = currentGenericJvmArguments.find("-XX:-UseSplitVerifier")
if -1 != currentGenericJvmArguments.find("UseSplitVerifier"):
    print "-XX:UseSplitVerifier already set"
else:
    print "Add generic arguments"
    AdminConfig.resetAttributes(jvmId, [['genericJvmArguments', currentGenericJvmArguments + ' ' + genericArgument]] )
#endIf

installCustomService(serverConfigId,[['displayName',wilyCustomServiceDisplayName],['classname',wilyCustomServiceClassName],['classpath',wilyCustomServiceClassPath],['enable','true']])

# -----------------------------------------------------------------------------

# Save all configuration changes
AdminConfig.save()

print "Script finished."
Run the command in the DMGR bin directory:
./wsadmin.sh -conntype SOAP -host iedm2c16 -port 8881 -f /tmp/EnableWily.py -appServerName=ABPD -nodeName=iedm2d16Node04
./wsadmin.sh -conntype SOAP -host iedm2c16 -port 8881 -f /tmp/EnableWily.py -appServerName=ABPD -nodeName=iedm2c16Node03

Selenium Unittest in Python [1]

https://www.seleniumeasy.com/python/selenium-webdriver-unittest-example
To support test automation we use the ‘unittest’ framework, which was originally inspired by JUnit and is included in the Python standard library by default.
Unittest supports test automation by sharing setup and shutdown code between tests. The setUp() and tearDown() methods let you define a set of instructions/commands to be executed before and after each test method. If setUp() itself raises an exception, the test method will not be executed; but if setUp() succeeds, tearDown() will run whether or not the test method passes.
The section below gives a quick look at unittest features using Selenium WebDriver:

import unittest
from selenium import webdriver

class InputFormsCheck(unittest.TestCase):

    #Opening browser.
    def setUp(self):
        self.driver = webdriver.Chrome(r"C:\Program Files\chromedriver.exe")
 
    #Testing Single Input Field.    
    def test_singleInputField(self):
        pageUrl = "http://www.seleniumeasy.com/test/basic-first-form-demo.html"
        driver=self.driver
        driver.maximize_window()
        driver.get(pageUrl)

        #Finding "Single input form" input text field by id. And sending keys(entering data) in it.
        eleUserMessage = driver.find_element_by_id("user-message")
        eleUserMessage.clear()
        eleUserMessage.send_keys("Test Python")

        #Finding "Show Your Message" button element by css selector using both id and class name. And clicking it.
        eleShowMsgBtn=driver.find_element_by_css_selector('#get-input > .btn')
        eleShowMsgBtn.click()

        #Checking whether the input text and output text are same using assertion.
        eleYourMsg=driver.find_element_by_id("display")
        assert "Test Python" in eleYourMsg.text
 
    # Closing the browser.
    def tearDown(self):
        self.driver.close()

# When this file is run directly, Python sets __name__ to "__main__" and unittest.main() runs the tests.
# If this file is imported from another module, __name__ is set to this module's own name instead.
if __name__ == "__main__":
    unittest.main()
C:\Users\zhuby\hans\testcase>main.py
DevTools listening on ws://127.0.0.1:57826/devtools/browser/6f03e012-bed9-4e5c-9775-7f79a3bf8770
.
Ran 1 test in 9.061s
OK

What is the above program doing?

Step 1: It opens the Chrome browser
Step 2: Enters the URL and maximizes the browser
Step 3: Enters the text ‘Test Python’ in the input text field
Step 4: Clicks on a button
Step 5: Finally, it checks for the text we entered and then closes the browser.