Numpy
Built on Numpy:
SciPy
Pandas
matplotlib
TensorFlow
scikit-learn
PyTorch
JAX
Numpy quick start guide:
https://numpy.org/doc/stable/user/quickstart.html
The closely related package scipy provides, among other things, optimization routines.
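As a minimal illustration (assuming SciPy is installed), minimizing a simple one-dimensional function:

import scipy.optimize as opt
# minimize (x - 2)^2; res.x should be close to 2.0
res = opt.minimize_scalar(lambda x: (x - 2)**2)
res.x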
Basic numerical data types:
bool
int
uint
float
complex
https://docs.scipy.org/doc/numpy-1.10.1/user/basics.types.html
numpy.array
https://docs.scipy.org/doc/numpy-1.10.1/user/basics.creation.html
compared to list:
import numpy as np
np.array([1, 2, 3], dtype='uint')
array([1, 2, 3], dtype=uint32)
array([[0., 0., 0.],
[0., 0., 0.]])
np.arange(2, 3, 0.1, dtype='float')
array([2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9])
Beware: large integers can overflow, and the result may differ across machines (the default integer dtype is platform-dependent).
import numpy as np
a = np.arange(100_000_000, dtype='int64')
np.sum(a)
4999999950000000
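A minimal sketch of the overflow issue: fixed-width integers wrap around (NumPy may also emit a RuntimeWarning, depending on version):

import numpy as np
x = np.int32(2**31 - 1)   # largest int32
x + np.int32(1)           # wraps to -2147483648, not 2**31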
np.empty sometimes returns arbitrary (uninitialized) values, which is an easy source of bugs
array([[0., 0., 0.],
[0., 0., 0.]])
arange()
np.arange(27).reshape(3, 3, 3)
array([[[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8]],
[[ 9, 10, 11],
[12, 13, 14],
[15, 16, 17]],
[[18, 19, 20],
[21, 22, 23],
[24, 25, 26]]])
a = np.arange(27).reshape(3, 3, 3)
a[0]
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
array([5, 6, 7, 8, 9])
array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
array indexing
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
array([0, 1, 2, 3, 4, 5, 6, 7])
array([1, 3, 5])
array([0, 2, 4, 6, 8])
x = np.reshape(np.arange(1, 13), (3, 4))
x
array([[ 1, 2, 3, 4],
[ 5, 6, 7, 8],
[ 9, 10, 11, 12]])
array([5, 6, 7, 8])
array([[ 2, 4],
[ 6, 8],
[10, 12]])
array([ 2, 12])
array([ 8, 9, 10, 11, 12])
array([], shape=(0, 2, 5), dtype=int32)
x = np.arange(9).reshape(3, 3)
print(x)
x[np.ix_([0, 2], [0, 1])]
[[0 1 2]
[3 4 5]
[6 7 8]]
array([[0, 1],
[6, 7]])
Example: MNIST
import numpy as np
minist = np.load('8/mnist.npz')
array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8)
['images', 'labels']
2
(60000, 784)
img = minist['images']
img[0].shape
(784,)
img_0 = img[0].reshape(28, 28)
%matplotlib inline
import matplotlib.pyplot as plt
plt.imshow(img_0, cmap="Greys")
<matplotlib.image.AxesImage at 0x2d675185120>
The MNIST data are stored as uint8, so summing them directly is wrong (the values overflow); convert to a wider type first.
np.uint8(255).astype(int).dtype
dtype('int32')
numpy/scipy universal functions (ufuncs) support vectorized operations
np.min, np.max are faster than the built-in min and max
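A rough sketch of the speed difference (exact timings vary by machine):

import numpy as np
from timeit import timeit
a = np.random.rand(1_000_000)
timeit(lambda: min(a), number=10)     # built-in min iterates element by element
timeit(lambda: np.min(a), number=10)  # vectorized reduction, much faster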
sum(img.reshape(len(img), 28, 28)).shape
(28, 28)
33.318421449829934
(47040000,)
Summing along a given axis:
plt.imshow(img[minist['labels']==8].mean(axis=0).reshape(28, 28))
<matplotlib.image.AxesImage at 0x2d675301ab0>
Numpy example
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

salaries_full = pd.read_csv(
    "9/salaries_clean.csv",
    encoding="latin1",
    index_col="salary_id")
columns = [
    "location_latitude",
    "location_longitude",
    "total_experience_years",
    "employer_experience_years",
    "annual_base_pay",
    "signing_bonus",
]
salary_data = salaries_full[columns].to_numpy()
salary_data
array([[ 3.7770e+01, -1.2241e+02, 1.3000e+01, 2.0000e+00, 1.2500e+05,
5.0000e+03],
[ 3.6360e+01, -9.4200e+01, 1.5000e+01, 8.0000e+00, 6.5000e+04,
nan],
[ 4.1470e+01, -8.1670e+01, 4.0000e+00, 1.0000e+00, 8.6000e+04,
5.0000e+03],
...,
[ nan, nan, 7.0000e+00, 1.0000e+00, 1.5000e+05,
0.0000e+00],
[ 3.8990e+01, -7.6930e+01, 5.0000e+00, 1.0000e+00, 7.5000e+04,
nan],
[ nan, nan, 2.0000e+00, 0.0000e+00, 5.0000e+01,
0.0000e+00]])
Numpy indexing conventions:
(1655,)
col = np.array(columns)
col == 'annual_base_pay'
annual_base_pay = salary_data[:, col == 'annual_base_pay']
np.mean(annual_base_pay)
nan
np.nanmean(annual_base_pay)
6325003.746965475
np.isnan(annual_base_pay)
array([[False],
[False],
[False],
...,
[False],
[False],
[False]])
annual_base_pay.reshape(-1)
array([1.25e+05, 6.50e+04, 8.60e+04, ..., 1.50e+05, 7.50e+04, 5.00e+01])
annual_base_pay[np.isnan(annual_base_pay)]
array([nan, nan, nan, nan])
~np.isnan(annual_base_pay)
array([[ True],
[ True],
[ True],
...,
[ True],
[ True],
[ True]])
np.all(~np.isnan(salary_data), axis=1)
np.any(np.isnan(salary_data), axis=1)
array([False, True, False, ..., True, True, True])
salary_data[~np.any(np.isnan(salary_data), axis=1)]
array([[ 3.7770e+01, -1.2241e+02, 1.3000e+01, 2.0000e+00, 1.2500e+05,
5.0000e+03],
[ 4.1470e+01, -8.1670e+01, 4.0000e+00, 1.0000e+00, 8.6000e+04,
5.0000e+03],
[ 3.8000e+01, -9.7000e+01, 5.0000e+00, 1.5000e+00, 4.0000e+04,
0.0000e+00],
...,
[ 4.0030e+01, -7.5630e+01, 2.0000e+00, 1.0000e+00, 6.0000e+04,
0.0000e+00],
[ 3.9950e+01, -7.5160e+01, 6.0000e+00, 2.5000e+00, 1.1100e+05,
5.0000e+03],
[ 4.0020e+01, -1.0525e+02, 1.0000e+00, 5.0000e-01, 6.0500e+04,
0.0000e+00]])
array([[ 3.899e+01, -7.693e+01, 5.000e+00, 1.000e+00, 7.500e+04,
nan],
[ nan, nan, 2.000e+00, 0.000e+00, 5.000e+01,
0.000e+00]])
array([[ 3.636e+01, -9.420e+01, 1.500e+01, 8.000e+00, 6.500e+04,
nan],
[ 4.147e+01, -8.167e+01, 4.000e+00, 1.000e+00, 8.600e+04,
5.000e+03],
[ nan, nan, 4.000e+00, 0.000e+00, 1.050e+05,
5.000e+03]])
15.0
Normalizing Data
col_mean = np.nanmean(salary_data, axis=0)
array([ 3.77543939e+01, -6.47122727e+01, 6.75611318e+00, 2.65616294e+00,
6.32500375e+06, 2.24056269e+04])
Note: if the matrix is very large, broadcasting can create huge intermediate arrays and exhaust memory; in that case you may have to fall back to a for loop.
For broadcasting, each dimension of the two arrays must either match, equal 1, or be absent:
$5\times4$ broadcasts with a scalar;
$5\times4$ broadcasts with a length-4 array.
But to subtract a different number from each row of a $5\times4$ matrix, a plain length-5 array does not work, because broadcasting aligns shapes from the innermost (trailing) axis. Reshape it to (5, 1), e.g. with reshape(5, 1) or [:, None], i.e. append a trailing dimension of size 1. A sketch follows.
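A small sketch of this rule:

import numpy as np
A = np.arange(20.).reshape(5, 4)
r = A.mean(axis=1)                # shape (5,)
# A - r                          # ValueError: trailing axes are 4 vs 5
(A - r[:, None]).shape            # (5,1) broadcasts across the columns -> (5, 4)
(A - A.mean(axis=1, keepdims=True)).shape  # equivalent, via keepdims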
salary_data.mean(1)
salary_data.mean(axis=1)
array([21655.06, nan, 15160.8 , ..., nan, nan, nan])
row_mu = salary_data.mean(axis=1)
row_mu.shape, row_mu[:, None].shape, row_mu[:, None, None].shape
((1655,), (1655, 1), (1655, 1, 1))
salary_data - row_mu[:, None]
array([[-21617.29, -21777.47, -21642.06, -21653.06, 103344.94, -16655.06],
[ nan, nan, nan, nan, nan, nan],
[-15119.33, -15242.47, -15156.8 , -15159.8 , 70839.2 , -10160.8 ],
...,
[ nan, nan, nan, nan, nan, nan],
[ nan, nan, nan, nan, nan, nan],
[ nan, nan, nan, nan, nan, nan]])
np.array(1).shape, np.array(1)[None].shape
((), (1,))
row_mu = salary_data.mean(axis=1, keepdims=True)
row_mu.shape
(1655, 1)
salary_data.mean(axis=1)
array([21655.06, nan, 15160.8 , ..., nan, nan, nan])
for loops are too slow
prices = np.array([20, 18, 21, 14])
prices - np.minimum.accumulate(prices)
array([0, 0, 3, 0])
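For comparison, a sketch of the same running-minimum computation with an explicit loop (orders of magnitude slower on large arrays):

import numpy as np
prices = np.array([20, 18, 21, 14])
out = []
running_min = float('inf')
for p in prices:
    running_min = min(running_min, p)
    out.append(p - running_min)
out   # [0, 0, 3, 0]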
n = 10
p = 2
k = 3
x = np.random.rand(n, p)
c = np.random.rand(k, p)
x.shape
(10, 2)
array([[0.57019677, 0.43860151],
[0.98837384, 0.10204481],
[0.20887676, 0.16130952],
[0.65310833, 0.2532916 ],
[0.46631077, 0.24442559],
[0.15896958, 0.11037514],
[0.65632959, 0.13818295],
[0.19658236, 0.36872517],
[0.82099323, 0.09710128],
[0.83794491, 0.09609841]])
array([[0.58651293, 0.02010755],
[0.82894003, 0.00469548],
[0.67781654, 0.27000797],
[0.73519402, 0.96218855],
[0.24875314, 0.57615733],
[0.59204193, 0.57225191],
[0.22308163, 0.95274901],
[0.44712538, 0.84640867],
[0.69947928, 0.29743695],
[0.81379782, 0.39650574]])
(n,1,p) minus (1,k,p) broadcasts to an (n,k,p) array.
D = x[:, None, :] - c[None, :]
D[0]
array([[-0.40626269, -0.03004969],
[-0.40656432, -0.16624401],
[-0.16906681, 0.39941372]])
(10, 3, 2)
x_c_norm = np.linalg.norm(D, axis=2)
np.argmin(x_c_norm, axis=1)
array([0, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)
import numpy as np
k = 3
a = np.array([0, 1, 2, 3, 4, 5])
ans = a[:, None] + a[None, :]
ans[:len(a)-k+1, :k]
array([[0, 1, 2],
[1, 2, 3],
[2, 3, 4],
[3, 4, 5]])
import numpy as np
x = np.arange(10)
x[np.logical_and(x > 3, x < 7)]
array([4, 5, 6])
x[np.logical_or(x > 3, x < 7)]
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
np.random.random((2, 3))
array([[0.8754406 , 0.31234192, 0.21472602],
[0.1417056 , 0.6179606 , 0.0532995 ]])
np.random.normal(0, 1, 20)
array([ 1.36952814, 0.39363222, 0.48644709, -0.23743297, 0.21279548,
-0.5515374 , -2.22034906, -1.41651274, 0.08532262, 0.49804278,
-2.03253663, 0.66368554, 0.28155033, 0.94521731, 0.5938768 ,
-1.57368613, -1.55284648, 0.54146951, -0.75257585, -0.23976487])
np.random.uniform(0, 1, (2, 4))
array([[0.5909873 , 0.8235658 , 0.14703172, 0.94326283],
[0.79080321, 0.34115745, 0.75083531, 0.31725324]])
x = np.arange(1, 11)
for i in range(5):
    print(np.random.choice(x, 5, False, x/float(sum(x))))
[ 6 10 7 9 5]
[ 9 8 10 7 6]
[ 2 8 9 10 5]
[10 9 8 3 6]
[7 8 6 5 9]
x = np.arange(10)
np.random.shuffle(x)
x
array([1, 7, 4, 0, 8, 3, 5, 2, 6, 9])
np.random.permutation(x)
array([1, 9, 5, 7, 3, 4, 2, 0, 6, 8])
array([1, 7, 4, 0, 8, 3, 5, 2, 6, 9])
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
Pandas
basic structures
Series
One dimensional labeled array
DataFrame
table of rows, with labeled columns
import numpy as np
a = np.array([0, 1, 2, 3, 4, 5])
(a[:, None] + a[None, :])[:4, :3]
array([[0, 1, 2],
[1, 2, 3],
[2, 3, 4],
[3, 4, 5]])
import pandas as pd
flight = pd.read_csv("10/flights.csv.gz")
Series
can create a pandas Series from any array-like structure (dict, numpy array, list)
Unlike a NumPy array, a Series has labels (an index).
import numpy as np
numbers = np.random.randn(5)
s = pd.Series(numbers)
s
0 1.556860
1 0.187127
2 -0.245997
3 1.342677
4 -0.117419
dtype: float64
idx = ['a', 'b', 'c', 'd', 'e']
s = pd.Series(numbers, index=idx)
s
a 1.556860
b 0.187127
c -0.245997
d 1.342677
e -0.117419
dtype: float64
providing too few or too many indices is a ValueError
pd.Series(np.random.rand(5), index=['a', 'b', 'c', 'd', 'e'])
a 0.996875
b 0.893754
c 0.925473
d 0.853753
e 0.500225
dtype: float64
can create a Series from a dict
d = {"dog": 1, "cat": 2}
pd.Series(d)
dog 1
cat 2
dtype: int64
pd.Series(d, index=["dog", "cat", "pig"])
dog 1.0
cat 2.0
pig NaN
dtype: float64
The dict has no 'pig' key, so pig is assigned NaN.
Note that inf == inf + X for any finite X.
np.nan is not equal to np.nan: test with np.isnan(), or test whether a variable is unequal to itself (NaN is the one special case where a value != itself). A sketch follows.
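A small sketch of these special values:

import numpy as np
np.nan == np.nan        # False
x = np.nan
x != x                  # True: usable as a NaN test
np.isnan(x)             # True: the idiomatic test
np.inf == np.inf + 100  # True: inf plus anything finite is still inf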
1.5568595147700188
-0.11741940757185318
a 1.556860
b 0.187127
c -0.245997
dtype: float64
1.5568595147700188
Note: when slicing with index labels (e.g. s['a':'c']), the endpoint after the colon is included. Positional (integer) slicing excludes the endpoint, even when the index labels themselves are numbers. (A sketch follows after the outputs below.)
a 1.556860
b 0.187127
c -0.245997
dtype: float64
a 1.556860
b 0.187127
c -0.245997
dtype: float64
a 4.743900
b 1.205781
c 0.781924
dtype: float64
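A small sketch of the label-versus-position slicing rule (hypothetical values):

import pandas as pd
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
s['a':'c']   # label slice: INCLUDES 'c' -> a, b, c
s[0:2]       # positional slice: excludes position 2 -> a, b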
indices need not be unique
s = pd.Series([2, 3, 5, 7, 11], index=['a', 'a', 'a', 'c', 'd'])
s
a 2
a 3
a 5
c 7
d 11
dtype: int64
a 2
a 3
a 5
dtype: int64
2
a 2
a 3
a 5
c 7
dtype: int64
a 2
a 3
a 5
dtype: int64
As with NumPy, you can index with a boolean Series.
c 7
d 11
dtype: int64
a False
a False
a False
c True
d True
dtype: bool
Series objects are dict-like, in that we can access and update entries via their keys.
Like a dictionary, accessing a non-existent key is a KeyError.
Series also support the in operator, as well as the dictionary get method.
Entries in a Series can be of mixed type (but not sequences). A sketch follows.
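A small sketch of the dict-like behavior:

import pandas as pd
s = pd.Series({'dog': 1, 'cat': 2})
'dog' in s        # True: `in` tests the index
s.get('pig')      # None instead of the KeyError that s['pig'] raises
s.get('pig', 0)   # 0: default value, as with dict.get
s['pig'] = 3      # assigning to a new key adds an entry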
a 2
a 3
a 5
c 7
d 11
Name: random int, dtype: int64
s = pd.Series(['dog', 'fish', 'elephant'])
s.map(lambda s: len(s))
0 3
1 4
2 8
dtype: int64
map on a Series can also change values based on another Series (or dict), used as a lookup table.
s = pd.Series(['fruit', 'animal', 'animal', 'fruit', 'fruit'],
              index=['apple', 'cat', 'goat', 'banana', 'kiwi'])
s
apple fruit
cat animal
goat animal
banana fruit
kiwi fruit
dtype: object
t = pd.Series({'fruit': 0, 'animal': 1})
s.map(t)
apple 0
cat 1
goat 1
banana 0
kiwi 0
dtype: int64
DataFrame
each column can have its own index; the DataFrame's index is the union of all of them, with missing entries set to NaN
d = {'A': pd.Series([1, 2, 3], index={'cat', 'dog', 'bird'}),
     'B': {'dog': 1, 'cat': 2, 'bird': 3, 'goat': 4}}
df = pd.DataFrame(d)
df
        A  B
bird  2.0  3
cat   3.0  2
dog   1.0  1
goat  NaN  4
bird 2.0
cat 3.0
dog 1.0
goat NaN
Name: A, dtype: float64
By default: rows and columns are ordered alphabetically
Index(['A', 'B'], dtype='object')
Index(['bird', 'cat', 'dog', 'goat'], dtype='object')
        A  B    C
bird  2.0  3  4.0
cat   3.0  2  6.0
dog   1.0  1  2.0
goat  NaN  4  NaN
flight['max_delay'] = list(map(max, flight['arr_delay'], flight['dep_delay']))
v = np.array([1, 2, 3, 4, 5])
np.maximum(v, v*2)
array([ 2, 4, 6, 8, 10])
5
flight['max_delay'] = np.maximum(flight['arr_delay'], flight['dep_delay'])
flight['num'] = 0
flight['num']
0 0
1 0
2 0
3 0
4 0
..
336771 0
336772 0
336773 0
336774 0
336775 0
Name: num, Length: 336776, dtype: int64
del flight['num']
flight.columns
Index(['year', 'month', 'day', 'dep_time', 'sched_dep_time', 'dep_delay',
'arr_time', 'sched_arr_time', 'arr_delay', 'carrier', 'flight',
'tailnum', 'origin', 'dest', 'air_time', 'distance', 'hour', 'minute',
'time_hour', 'max_delay'],
dtype='object')
A 3.0
B 2.0
Name: cat, dtype: float64
A 3.0
B 2.0
Name: cat, dtype: float64
cat 3.0
dog 1.0
Name: A, dtype: float64
df1 = pd.DataFrame(np.random.randn(8, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c'])
df1 + df2
          a         b         c   d
0  0.150499 -0.084967 -1.304810 NaN
1 -1.723753  0.809527 -3.027434 NaN
2 -1.770957 -1.115960 -0.606153 NaN
3 -1.322787 -0.154228  0.316819 NaN
4  0.357720  2.129364  2.252070 NaN
5       NaN       NaN       NaN NaN
6       NaN       NaN       NaN NaN
7       NaN       NaN       NaN NaN
          a         b         c
0  0.000000  0.000000  0.000000
1 -0.406540 -0.015429 -0.683493
2 -0.095646 -1.092577  1.542312
3 -0.462943 -0.349725  1.856521
4  0.749604  1.157111  1.999972
a True
b True
c True
d True
dtype: bool
a False
b False
c False
d False
dtype: bool
          0         1         2         3         4         5         6         7
a  0.039975  1.607250  2.643456  0.656756  0.117268  0.264463  0.471706  1.462087
b  0.012506  0.636951  0.002525  0.028437  0.893762  2.970262  0.084393  3.932983
c  0.002420  1.184460  0.797188  0.080707  2.273190  0.834139  2.697453  0.488765
d  0.650234  0.199167  0.196074  0.403005  0.352569  1.245693  0.628392  0.021431
          a         b         c         d
a  7.262961 -4.896682  2.348484 -2.077213
b -4.896682  8.561819  3.042636  2.741910
c  2.348484  3.042636  8.358324 -0.150733
d -2.077213  2.741910 -0.150733  3.696565
df.dropna(axis=1, how='any')
      B
bird  3
cat   2
dog   1
goat  4
        A  B
bird  2.0  3
cat   3.0  2
dog   1.0  1
df.dropna(axis=1, how="all")
        A  B
bird  2.0  3
cat   3.0  2
dog   1.0  1
goat  NaN  4
pandas supports read/write for a wide range of different file formats
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 year 336776 non-null int64
1 month 336776 non-null int64
2 day 336776 non-null int64
3 dep_time 328521 non-null float64
4 sched_dep_time 336776 non-null int64
5 dep_delay 328521 non-null float64
6 arr_time 328063 non-null float64
7 sched_arr_time 336776 non-null int64
8 arr_delay 327346 non-null float64
9 carrier 336776 non-null object
10 flight 336776 non-null int64
11 tailnum 334264 non-null object
12 origin 336776 non-null object
13 dest 336776 non-null object
14 air_time 327346 non-null float64
15 distance 336776 non-null int64
16 hour 336776 non-null int64
17 minute 336776 non-null int64
18 time_hour 336776 non-null object
19 max_delay 327346 non-null float64
dtypes: float64(6), int64(9), object(5)
memory usage: 51.4+ MB
   year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  sched_arr_time  arr_delay carrier  flight tailnum origin dest  air_time  distance  hour  minute             time_hour  max_delay
0  2013      1    1     517.0             515        2.0     830.0             819       11.0      UA    1545  N14228    EWR  IAH     227.0      1400     5      15  2013-01-01T10:00:00Z       11.0
1  2013      1    1     533.0             529        4.0     850.0             830       20.0      UA    1714  N24211    LGA  IAH     227.0      1416     5      29  2013-01-01T10:00:00Z       20.0
2  2013      1    1     542.0             540        2.0     923.0             850       33.0      AA    1141  N619AA    JFK  MIA     160.0      1089     5      40  2013-01-01T10:00:00Z       33.0
3  2013      1    1     544.0             545       -1.0    1004.0            1022      -18.0      B6     725  N804JB    JFK  BQN     183.0      1576     5      45  2013-01-01T10:00:00Z       -1.0
4  2013      1    1     554.0             600       -6.0     812.0             837      -25.0      DL     461  N668DN    LGA  ATL     116.0       762     6       0  2013-01-01T11:00:00Z       -6.0
        year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  sched_arr_time  arr_delay carrier  flight tailnum origin dest  air_time  distance  hour  minute             time_hour  max_delay
336771  2013      9   30       NaN            1455        NaN       NaN            1634        NaN      9E    3393     NaN    JFK  DCA       NaN       213    14      55  2013-09-30T18:00:00Z        NaN
336772  2013      9   30       NaN            2200        NaN       NaN            2312        NaN      9E    3525     NaN    LGA  SYR       NaN       198    22       0  2013-10-01T02:00:00Z        NaN
336773  2013      9   30       NaN            1210        NaN       NaN            1330        NaN      MQ    3461  N535MQ    LGA  BNA       NaN       764    12      10  2013-09-30T16:00:00Z        NaN
336774  2013      9   30       NaN            1159        NaN       NaN            1344        NaN      MQ    3572  N511MQ    LGA  CLE       NaN       419    11      59  2013-09-30T15:00:00Z        NaN
336775  2013      9   30       NaN             840        NaN       NaN            1020        NaN      MQ    3531  N839MQ    LGA  RDU       NaN       431     8      40  2013-09-30T12:00:00Z        NaN
df1 = df * 2
df2 = df + df
(df1 == df2).all()
A False
B True
dtype: bool
False
True
A 4.0
B 5.0
dtype: float64
bird 5.0
cat 5.0
dog 2.0
goat 8.0
dtype: float64
A 4.0
B 5.0
dtype: float64
np.mean applied to a DataFrame automatically skips NaNs (it dispatches to DataFrame.mean), but applied to a plain np.array it returns nan. For example:
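A minimal sketch of the difference:

import numpy as np
import pandas as pd
df = pd.DataFrame({'x': [1.0, np.nan, 3.0]})
np.mean(df)                # dispatches to df.mean(): NaN skipped -> x = 2.0
np.mean(df.to_numpy())     # plain ndarray -> nan
np.nanmean(df.to_numpy())  # 2.0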
           year          month            day       dep_time  sched_dep_time      dep_delay       arr_time  sched_arr_time      arr_delay         flight       air_time       distance           hour         minute      max_delay
count  336776.0  336776.000000  336776.000000  328521.000000   336776.000000  328521.000000  328063.000000   336776.000000  327346.000000  336776.000000  327346.000000  336776.000000  336776.000000  336776.000000  327346.000000
mean     2013.0       6.548510      15.710787    1349.109947     1344.254840      12.639070    1502.054999     1536.380220       6.895377    1971.923620     150.686460    1039.912604      13.180247      26.230100      16.969943
std         0.0       3.414457       8.768607     488.281791      467.335756      40.210061     533.264132      497.457142      44.633292    1632.471938      93.688305     733.233033       4.661316      19.300846      42.376455
min      2013.0       1.000000       1.000000       1.000000      106.000000     -43.000000       1.000000        1.000000     -86.000000       1.000000      20.000000      17.000000       1.000000       0.000000     -33.000000
25%      2013.0       4.000000       8.000000     907.000000      906.000000      -5.000000    1104.000000     1124.000000     -17.000000     553.000000      82.000000     502.000000       9.000000       8.000000      -4.000000
50%      2013.0       7.000000      16.000000    1401.000000     1359.000000      -2.000000    1535.000000     1556.000000      -5.000000    1496.000000     129.000000     872.000000      13.000000      29.000000       1.000000
75%      2013.0      10.000000      23.000000    1744.000000     1729.000000      11.000000    1940.000000     1945.000000      14.000000    3465.000000     192.000000    1389.000000      17.000000      44.000000      19.000000
max      2013.0      12.000000      31.000000    2400.000000     2359.000000    1301.000000    2400.000000     2359.000000    1272.000000    8500.000000     695.000000    4983.000000      23.000000      59.000000    1301.000000
pipe method: method chaining
f = lambda x: x**2
g = lambda x: x + 1
h = lambda x: 2 * x
df = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
df
          A         B         C
0 -0.413218  0.348975 -1.313533
1  1.658054  0.495508  2.021334
2 -1.136187  0.002771 -1.651880
3 -1.125284  0.205869 -0.670324
4  1.085758 -0.103062  2.407278
          A         B          C
0  2.341498  2.243567   5.450740
1  7.498284  2.491057  10.171579
2  4.581844  2.000015   7.457418
3  4.532527  2.084764   2.898668
4  4.357740  2.021243  13.589975
df.pipe(f).pipe(g).pipe(h)
          A         B          C
0  2.341498  2.243567   5.450740
1  7.498284  2.491057  10.171579
2  4.581844  2.000015   7.457418
3  4.532527  2.084764   2.898668
4  4.357740  2.021243  13.589975
row- and column-wise function: apply(). For example:
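A small sketch of apply() along each axis:

import numpy as np
import pandas as pd
df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=['a', 'b', 'c'])
df.apply(np.sum, axis=0)   # column-wise (the default): one value per column
df.apply(np.sum, axis=1)   # row-wise: one value per row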
Pandas II
import pandas as pd
import numpy as np
s = pd.Series(np.random.randn(8))
s.pct_change()
0 NaN
1 -3.496694
2 -1.735719
3 -0.905374
4 2.515764
5 -1.755356
6 0.058851
7 2.335997
dtype: float64
s1 = pd.Series(np.random.randn(100))
s2 = pd.Series(0.1 * s1 + np.random.randn(100))
s1.cov(s2)
0.2999112016156914
s = pd.Series(np.random.randn(5), index=list('abcde'))
s
a -0.709341
b 0.346186
c 0.136598
d -0.826597
e -1.181088
dtype: float64
a 3.0
b 5.0
c 4.0
d 2.0
e 1.0
dtype: float64
a 3.5
b 3.5
c 5.0
d 2.0
e 1.0
dtype: float64
tsdf = pd.DataFrame(np.random.randn(10, 3),
                    columns=['DOW', 'NASDAQ', 'SP500'],
                    index=pd.date_range('1/1/2000', periods=10))
tsdf
                 DOW    NASDAQ     SP500
2000-01-01  0.656688 -1.531741 -0.024392
2000-01-02 -0.706209  0.325865 -0.329847
2000-01-03 -0.127025  0.806435 -0.910426
2000-01-04 -0.401866  0.191501 -0.204383
2000-01-05 -1.283493  1.175840 -1.195107
2000-01-06  0.467726 -0.255641  0.056299
2000-01-07 -0.371994  0.140242 -0.650468
2000-01-08  0.818794  0.461664  1.267799
2000-01-09  2.121635  1.405991 -0.179316
2000-01-10 -1.062810  0.308460  1.338484
apply to each column
tsdf.agg([np.median, np.mean, np.std])
             DOW    NASDAQ     SP500
median -0.249509  0.317162 -0.191849
mean    0.011145  0.302862 -0.083136
std     1.022643  0.814786  0.829758
agg can take a dict of functions
tsdf.agg({'DOW': 'mean', 'NASDAQ': 'median', 'SP500': 'max'})
DOW 0.011145
NASDAQ 0.317162
SP500 1.338484
dtype: float64
tsdf.corr(method='spearman')
             DOW    NASDAQ     SP500
DOW     1.000000 -0.018182  0.369697
NASDAQ -0.018182  1.000000 -0.345455
SP500   0.369697 -0.345455  1.000000
tsdf.corr(method='pearson')
             DOW    NASDAQ     SP500
DOW     1.000000 -0.012922  0.191341
NASDAQ -0.012922  1.000000 -0.222936
SP500   0.191341 -0.222936  1.000000
DataFrame.rank() ranks within each column by default
             DOW  NASDAQ  SP500
2000-01-01   8.0     1.0    7.0
2000-01-02   3.0     6.0    4.0
2000-01-03   6.0     8.0    2.0
2000-01-04   4.0     4.0    5.0
2000-01-05   1.0     9.0    1.0
2000-01-06   7.0     2.0    8.0
2000-01-07   5.0     3.0    3.0
2000-01-08   9.0     7.0    9.0
2000-01-09  10.0    10.0    6.0
2000-01-10   2.0     5.0   10.0
            DOW  NASDAQ  SP500
2000-01-01  3.0     1.0    2.0
2000-01-02  1.0     3.0    2.0
2000-01-03  2.0     3.0    1.0
2000-01-04  1.0     3.0    2.0
2000-01-05  1.0     3.0    2.0
2000-01-06  3.0     1.0    2.0
2000-01-07  2.0     3.0    1.0
2000-01-08  2.0     1.0    3.0
2000-01-09  3.0     2.0    1.0
2000-01-10  1.0     2.0    3.0
iterating over a Series gives an iterator over its values
iterating over a DataFrame gives an iterator over its column names
df.iteritems
df.iterrows
pandas: Group By
df = pd.DataFrame({'A': ['plant', 'animal', 'plant', 'plant'],
                   'B': ['apple', 'goat', 'kiwi', 'grape'],
                   'C': np.random.randn(4),
                   'D': np.random.randn(4)})
df
        A      B         C         D
0   plant  apple  2.392546 -0.554108
1  animal   goat -1.292467  0.800007
2   plant   kiwi  0.718886  0.600089
3   plant  grape -0.665115 -1.205696
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000155D469CD00>
{'animal': [1], 'plant': [0, 2, 3]}
               C         D
A
animal -1.292467  0.800007
plant   0.815439 -0.386571
index = pd.date_range('10/1/1999', periods=1100)
ts = pd.Series(np.random.normal(0.5, 2, 1100), index)
ts.head()
1999-10-01 1.318674
1999-10-02 2.784437
1999-10-03 4.774021
1999-10-04 0.017314
1999-10-05 2.678237
Freq: D, dtype: float64
key = lambda d: d.year
zscore = lambda x: (x - x.mean()) / x.std()
transformed = ts.groupby(key).transform(zscore)
transformed.head()
1999-10-01 0.409280
1999-10-02 1.168705
1999-10-03 2.199526
1999-10-04 -0.264966
1999-10-05 1.113682
Freq: D, dtype: float64
filter returns an ungrouped result
sf = pd.Series([1, 1, 2, 2, 3, 3])
sf.groupby(sf).filter(lambda x: x.sum() > 2)
2 2
3 2
4 3
5 3
dtype: int64
df.groupby().agg()
df.groupby().transform()
pd.concat
df.pivot
(a sketch of the first three follows; pivot is demonstrated next)
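A minimal sketch of agg, transform, and concat (a hypothetical toy frame):

import numpy as np
import pandas as pd
df = pd.DataFrame({'g': ['a', 'a', 'b', 'b'], 'x': [1.0, 2.0, 3.0, 5.0]})
df.groupby('g').agg('mean')             # one row per group
df.groupby('g')['x'].transform('mean')  # group means, same length as the input
pd.concat([df, df]).shape               # stacks the two frames -> (8, 2)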
np.random.seed(0)
df4 = pd.DataFrame({
    "stock": ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'C', 'A', 'B', 'C',
              'A', 'B', 'C', 'D', 'D', 'D', 'D', 'A', 'B', 'C', 'D'],
    "date": ['20210104', '20210104', '20210104', '20210102', '20210102', '20210102',
             '20210103', '20210103', '20210101', '20210101', '20210101',
             '20210105', '20210105', '20210105', '20210102', '20210103', '20210104',
             '20210105', '20210106', '20210106', '20210106', '20210106'],
    "daily_ret": np.random.normal(0, 0.05, 22)})
df4_stacked = df4.pivot(index='date', columns='stock', values='daily_ret')
df4_stacked
stock             A         B         C         D
date
20210101  -0.005161  0.020530  0.007202       NaN
20210102   0.112045  0.093378 -0.048864  0.022193
20210103   0.047504       NaN -0.007568  0.016684
20210104   0.088203  0.020008  0.048937  0.074704
20210105   0.072714  0.038052  0.006084 -0.010258
20210106   0.015653 -0.042705 -0.127649  0.032681
s = df4_stacked.stack()
s
date stock
20210101 A -0.005161
B 0.020530
C 0.007202
20210102 A 0.112045
B 0.093378
C -0.048864
D 0.022193
20210103 A 0.047504
C -0.007568
D 0.016684
20210104 A 0.088203
B 0.020008
C 0.048937
D 0.074704
20210105 A 0.072714
B 0.038052
C 0.006084
D -0.010258
20210106 A 0.015653
B -0.042705
C -0.127649
D 0.032681
dtype: float64
s['20210104', 'A']
s['20210104']['A']
0.0882026172983832
matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
x = np.arange(10)
_ = plt.plot(x**2)
import matplotlib as mpl
mpl.style.available
['Solarize_Light2',
'_classic_test_patch',
'bmh',
'classic',
'dark_background',
'fast',
'fivethirtyeight',
'ggplot',
'grayscale',
'seaborn',
'seaborn-bright',
'seaborn-colorblind',
'seaborn-dark',
'seaborn-dark-palette',
'seaborn-darkgrid',
'seaborn-deep',
'seaborn-muted',
'seaborn-notebook',
'seaborn-paper',
'seaborn-pastel',
'seaborn-poster',
'seaborn-talk',
'seaborn-ticks',
'seaborn-white',
'seaborn-whitegrid',
'tableau-colorblind10']
mpl.style.use('ggplot')
mpl.style.use('seaborn-white')
%matplotlib inline
x = np.arange(10)
_ = plt.plot(x**2)
t = np.arange(10)
s = np.random.normal(1, 1, 10)
_ = plt.bar(t, s, align='center')
fig, axs = plt.subplots(nrows=3, ncols=3)
x = np.arange(0, 100)
axs[0][0].plot(x, x**2)
[<matplotlib.lines.Line2D at 0x155d7666eb0>]
Regular Expressions and Text Encoding
Unicode: ASCII is a subset of UTF-8
re.match()
matches at the start of the string
re.search()
matches any part of the string
re.findall()
finds all matches of the pattern in the string
https://regex101.com
import re
re.match('cat', 'cats and dogs')
<re.Match object; span=(0, 3), match='cat'>
regex = 'cat'
re.match(regex, 'fat on mat') is None
True
if re.match(regex, 'fat on mat'):
    print('matches')
else:
    print('Not')
Not
re.search('cat', 'rains cats and dogs')
<re.Match object; span=(6, 9), match='cat'>
if re.search('cat', 'rains cats and dogs'):
    print('matches')
matches
re.findall('cat', 'rains cats and cat')
['cat', 'cat']
<re.Match object; span=(0, 2), match='\\$'>
<re.Match object; span=(0, 1), match='$'>
raw strings:
"\n" is a newline; r"\n" is a literal backslash followed by n (r"\n" == "\\n")
It is recommended to write regex patterns as raw strings (always prefix them with r).
1 print (r"\n1.00" ,"\n1.00" )
\n1.00
1.00
<re.Match object; span=(0, 1), match='$'>
<re.Match object; span=(0, 1), match='$'>
1 re.findall("[aeiou]" ,"cats and dog" )
['a', 'a', 'o']
1 re.findall("[a-z ]" ,"cats and Dog" )
['c', 'a', 't', 's', ' ', 'a', 'n', 'd', ' ', 'o', 'g']
1 re.findall("[a-z]" ,"cats and Dog" )
['c', 'a', 't', 's', 'a', 'n', 'd', 'o', 'g']
1 re.findall("[A-Z]" ,"cats and Dog" )
['D']
1 re.findall("[a-zA-Z]" ,"cats and Dog" )
['c', 'a', 't', 's', 'a', 'n', 'd', 'D', 'o', 'g']
1 re.findall("[^a-zA-Z]" ,"cats and Dog" )
[' ', ' ']
1 re.findall("[a\-z]" ,"i am a z-ip" )
['a', 'a', 'z', '-']
1 re.findall("[(+*)^]" ,"1+1+2^1=3*(1)" )
['+', '+', '^', '*', '(', ')']
1 re.findall("[^(+*)^]" ,"1+1+2^1=3*(1)" )
['1', '1', '2', '1', '=', '3', '1']
\s : whitespace (space, tab, newline) = [ \t\n]
\w : word characters = [a-zA-Z0-9_]
\d : digits = [0-9]
"^" : inside [...] it negates the class; outside, it anchors the start of a line (every line implicitly starts with an invisible ^ and ends with an invisible $)
"$" : end of a line
"." : matches any character; to match a literal period, use "\."
"o..r" matches "o", then any two characters, then "r"
"\b" : word boundary; r"a\b" matches an "a" that ends a word
To negate these classes, capitalize them (e.g. "\S", "\W", "\D", "\B"); "\B" matches everywhere except a word boundary
"+" : "e+" matches one or more e's
"*" : "e*" matches zero or more e's
1 re.findall("oe+" ,"oeuvree" )
['oe']
1 re.findall("[oe]+" ,"oeuvree" )
['oe', 'ee']
1 re.findall("[whe]*" ,"while wheeee weheee" )
['wh', '', '', 'e', '', 'wheeee', '', 'weheee', '']
‘*’ : zero or more of the previous item
‘+’ : one or more of the previous item
‘?’ : zero or one of the previous item
‘{4}’ : exactly four of the previous item
‘{3,}’ : three or more of previous item
‘{2,5}’ : between two and five (inclusive) of previous item
1 re.findall("whe*" ,"while wheeee weheee heee weee" )
['wh', 'wheeee']
‘|’ (“pipe”) is a special character that allows one to specify “or”
clauses
Example: I want to match the word “cat” or the word “dog”
Solution: ‘(cat|dog)’ (remember the parentheses!)
Note: parentheses are not strictly necessary here, but parentheses
tend to make for easier reading and avoid possible ambiguity. It’s a
good habit to just use them always.
1 re.findall(r"a|aa|aaa" ,"aaaa" )
['a', 'a', 'a', 'a']
1 re.findall(r"a+" ,"aaaa" )
['aaaa']
'?' modifies operators like '+' and '*' to not be greedy (lazy matching)
1 re.findall(r"a+?" ,"aaaa" )
['a', 'a', 'a', 'a']
1 re.findall("\w+@\w+\.\w+" ,"your email is kobe@icloud.com, his email is jacob@umich.edu" )
['kobe@icloud.com', 'jacob@umich.edu']
capture group ():
regex = r'(\w+@\w+\.\w+)'
s = "your email is kobe@icloud.com kobe@icloud.com his email is jacob@umich.edu"
m = re.findall(regex, s)
m
['kobe@icloud.com', 'kobe@icloud.com', 'jacob@umich.edu']
backreferences
\1 requires that whatever matched in the parentheses () occurs once again
r"([a-zA-Z]+).*\1" matches strings that begin and end with the same word, with anything in between
regex = r'(\w+)\1'
s = "freefree"
m = re.search(regex, s)
m.group(1)
'free'
regex = r'\w+'
s = "nonrepeat"
m = re.search(regex, s)
m.group(0)
'nonrepeat'
SQL
Each column has a single type.
key: uniquely identifies a record
sqliteonline.com
features (ACID):
atomicity
consistency
isolation
durability
https://www.w3schools.com/sql/
select
SELECT * FROM countries
SELECT id, name FROM countries
SELECT * FROM countries WHERE [filter]
<> means not-equal (like !=); = tests equality (like ==)
SELECT * FROM countries WHERE [] BETWEEN 86 AND 90
SELECT * FROM countries WHERE [filter] IS NULL
SORT
ORDER BY
SELECT [columns] FROM [table] ORDER BY [column] [ASC|DESC]
SELECT [columns] FROM countries ORDER BY phonecode ASC, name DESC
is equivalent to SELECT [columns] FROM countries ORDER BY phonecode, name DESC
(ASC is the default)
DISTINCT
SELECT DISTINCT [columns] FROM [table]
WHERE
supports set membership:
SELECT id from t_student WHERE major IN
("Mathematics","statistics")
SELECT id from t_student WHERE major NOT IN
("Mathematics","statistics")
supports regex-like matching via LIKE (see the sketch below):
SELECT id from t_student WHERE first_name LIKE "M%"
%: matches any number of characters
_: matches exactly one character
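A minimal sketch of the LIKE wildcards using an in-memory database (the t_student rows here are made up):

import sqlite3
con_demo = sqlite3.connect(":memory:")
con_demo.execute("CREATE TABLE t_student (id INT, first_name TEXT, major TEXT)")
con_demo.executemany("INSERT INTO t_student VALUES (?, ?, ?)",
                     [(1, 'Mary', 'Mathematics'), (2, 'Mike', 'History'),
                      (3, 'Anna', 'Statistics')])
con_demo.execute("SELECT id FROM t_student WHERE first_name LIKE 'M%'").fetchall()
# [(1,), (2,)]: 'M%' matches any name starting with M
con_demo.execute("SELECT id FROM t_student WHERE first_name LIKE 'M_k_'").fetchall()
# [(2,)]: each '_' matches exactly one character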
GROUP BY
SELECT customer_id, SUM(dollar_amount) FROM t_transactions GROUP BY
customer_id
COUNT, AVG, MIN, MAX: aggregate functions
SELECT phonecode, COUNT() FROM countries GROUP BY phonecode
HAVING: filter results after GROUP BY using the HAVING keyword
SELECT customer_id, SUM(dollar_amount) AS total_dolar FROM
t_transactions GROUP BY customer_id HAVING total_dolar >50
JOIN: merging tables
INNER JOIN
SELECT id, name, color FROM t_student INNER JOIN t_personal ON
t_student.id=t_personal.id
LEFT JOIN
RIGHT JOIN
FULL JOIN
INSERT INTO
INSERT INTO table_name [col1, col2, col3, …] VALUES value1, value2,
value3, …
UPDATE
UPDATE table_name SET col1=value1, col2=value2 WHERE condition
Caution: if WHERE is left empty, you'll delete/modify the whole table!
DELETE
DELETE FROM table_name WHERE condition
CREATE TABLE
CREATE TABLE table_name [col1 datatype, col2 datatype, …]
DROP TABLE
SELECT * FROM sqlite_master
SELECT * FROM sqlite_master WHERE type="table"
Python examples from the slides
import sqlite3
con = sqlite3.connect('ps8/flights.sqlite')
cur = con.cursor()
<sqlite3.Connection at 0x2615592b040>
res = cur.execute("select * from flights limit 10")
res
<sqlite3.Cursor at 0x2615594d3c0>
(Actually, res
is just the original cursor itself):
True
for row in res:
    print(row[0])
2013
2013
2013
2013
2013
2013
2013
2013
2013
2013
Fetching results
To retrieve results, use the cur.fetchone()
or
cur.fetchall()
methods:
res = cur.execute("select * from flights limit 10")
cur.fetchone()
(2013,
1,
1,
517,
515,
2.0,
830,
819,
11.0,
'UA',
1545,
'N14228',
'EWR',
'IAH',
227.0,
1400.0,
5.0,
15.0,
1357034400.0)
This returns a tuple containing one entry for each column in our
SELECT
statement.
If you want all the rows, use cur.fetchall()
:
It is an iterator
9
Notice that .fetchall() works like an iterator--we SELECTed 10 rows, fetched one of them using .fetchone(), and then fetched the remaining 9 using .fetchall(). If we run this again we'll get zero rows back because we already fetched everything:
0
Parameterized queries
Sometimes our queries must depend on user input. Consider the
following query which returns all the flights that happened in a certain
month:
def num_flights_in_month(month):
    query = """SELECT COUNT() FROM flights WHERE month = %d""" % month
    return cur.execute(query).fetchone()[0]

num_flights_in_month(1)
27004
For queries that depend on a parameter, we can use the special
placeholder ?
to have the database do parameter
substitution for us:
def num_flights_in_month(m):
    query = """SELECT COUNT() FROM flights WHERE month = ?"""
    return cur.execute(query, [m]).fetchone()[0]

num_flights_in_month(1)
27004
Using multiple placeholders is possible:
def num_flights_on_day(m, d):
    query = """SELECT COUNT() FROM flights WHERE month = ? AND day = ?"""
    return cur.execute(query, [m, d]).fetchone()[0]

num_flights_on_day(12, 25)
719
Notice that, even if there is just one parameter, you have to pass in
all the parameters as a list
.
Why use placeholders
Consider the following table:
scon = sqlite3.connect(":memory:")
scon.execute('CREATE TABLE users (id INTEGER PRIMARY KEY AUTOINCREMENT, username TEXT, age INT, password TEXT)')
scon.execute("""INSERT INTO users(username, age, password)
               VALUES ('Jonathan', 40, 'beeri$g00d'),
                      ('Sarah', 41, 'wat3rp0l0'),
                      ('Julian', 2, 'il0vecr0ut0n3')""")
scon.commit()
Here's our table:
import pandas as pd
pd.read_sql_query("SELECT * from users", scon)
   id  username  age       password
0   1  Jonathan   40     beeri$g00d
1   2     Sarah   41      wat3rp0l0
2   3    Julian    2  il0vecr0ut0n3
Now let's say we have a function which takes a user-supplied
parameter, their username, and returns their user information:
def user_info(username):
    q = "SELECT * FROM users WHERE username = '%s'" % username
    print(q)
    return pd.read_sql_query(q, scon)

user_info('Jonathan')
SELECT * FROM users WHERE username = 'Jonathan'
   id  username  age    password
0   1  Jonathan   40  beeri$g00d
This works great assuming that what got passed in is actually a
username, but a malicious user could do the following:
1 user_info("""abc' or 'a'='a""" )
SELECT * FROM users WHERE username = 'abc' or 'a'='a'
   id  username  age       password
0   1  Jonathan   40     beeri$g00d
1   2     Sarah   41      wat3rp0l0
2   3    Julian    2  il0vecr0ut0n3
This is known as an injection attack. For this reason, we do not pass user input directly into a query string.
Instead we use placeholders, and rely on the SQL engine itself to
perform the substitution:
def safe_user_info(username):
    q = "SELECT * FROM users WHERE username = ?"
    print(q)
    return pd.read_sql_query(q, scon, params=[username])

safe_user_info('Jonathan')
SELECT * FROM users WHERE username = ?
   id  username  age    password
0   1  Jonathan   40  beeri$g00d
1 safe_user_info("""abc' or 'a'='a""" )
SELECT * FROM users WHERE username = ?
Ten questions about flights
Now we will use SQL queries to answer various questions about the flights dataset.
Which airport is the busiest?
We need to aggregate flights by the origin column, and then count the number of rows in each group.
1 cur.execute("SELECT origin, COUNT() FROM flights GROUP BY origin" ).fetchall()
[('EWR', 120835), ('JFK', 111279), ('LGA', 104662)]
1 cur.execute("SELECT origin, COUNT() FROM flights GROUP BY origin" ).description
(('origin', None, None, None, None, None, None),
('COUNT()', None, None, None, None, None, None))
1 pd.read_sql_query("SELECT origin, COUNT() FROM flights GROUP BY origin" ,con)
  origin  COUNT()
0    EWR   120835
1    JFK   111279
2    LGA   104662
Which month was coldest?
We need to aggregate weather by month, and then compute some measure of coldness.
1 cur.execute("SELECT month, AVG(temp) FROM weather GROUP BY month" ).fetchall()
[(1, 35.63566037735852),
(2, 34.27059701492533),
(3, 39.880071845532164),
(4, 51.745641500694774),
(5, 61.79500000000004),
(6, 72.18399999999997),
(7, 80.06622082585272),
(8, 74.4684657039714),
(9, 67.3712922649374),
(10, 60.07113019891505),
(11, 44.9904343764596),
(12, 38.44180037313427)]
What is the most popular carrier at each of the three NYC airports?
This one is a bit trickier. We start by counting the number of flights for each combination of carrier and origin:
1 cur.execute("SELECT COUNT(), carrier, origin FROM flights GROUP BY carrier, origin" ).fetchall()
[(1268, '9E', 'EWR'),
(14651, '9E', 'JFK'),
(2541, '9E', 'LGA'),
(3487, 'AA', 'EWR'),
(13783, 'AA', 'JFK'),
(15459, 'AA', 'LGA'),
(714, 'AS', 'EWR'),
(6557, 'B6', 'EWR'),
(42076, 'B6', 'JFK'),
(6002, 'B6', 'LGA'),
(4342, 'DL', 'EWR'),
(20701, 'DL', 'JFK'),
(23067, 'DL', 'LGA'),
(43939, 'EV', 'EWR'),
(1408, 'EV', 'JFK'),
(8826, 'EV', 'LGA'),
(685, 'F9', 'LGA'),
(3260, 'FL', 'LGA'),
(342, 'HA', 'JFK'),
(2276, 'MQ', 'EWR'),
(7193, 'MQ', 'JFK'),
(16928, 'MQ', 'LGA'),
(6, 'OO', 'EWR'),
(26, 'OO', 'LGA'),
(46087, 'UA', 'EWR'),
(4534, 'UA', 'JFK'),
(8044, 'UA', 'LGA'),
(4405, 'US', 'EWR'),
(2995, 'US', 'JFK'),
(13136, 'US', 'LGA'),
(1566, 'VX', 'EWR'),
(3596, 'VX', 'JFK'),
(6188, 'WN', 'EWR'),
(6087, 'WN', 'LGA'),
(601, 'YV', 'LGA')]
Next, we need to rank the carriers within origin by their counts.
cur.execute("""
SELECT RANK() OVER (
    PARTITION BY origin
    ORDER BY n
) AS r, carrier, origin, n
FROM (SELECT COUNT() as n, carrier, origin FROM flights GROUP BY carrier, origin)
""").fetchall()
[(1, 'OO', 'EWR', 6),
(2, 'AS', 'EWR', 714),
(3, '9E', 'EWR', 1268),
(4, 'VX', 'EWR', 1566),
(5, 'MQ', 'EWR', 2276),
(6, 'AA', 'EWR', 3487),
(7, 'DL', 'EWR', 4342),
(8, 'US', 'EWR', 4405),
(9, 'WN', 'EWR', 6188),
(10, 'B6', 'EWR', 6557),
(11, 'EV', 'EWR', 43939),
(12, 'UA', 'EWR', 46087),
(1, 'HA', 'JFK', 342),
(2, 'EV', 'JFK', 1408),
(3, 'US', 'JFK', 2995),
(4, 'VX', 'JFK', 3596),
(5, 'UA', 'JFK', 4534),
(6, 'MQ', 'JFK', 7193),
(7, 'AA', 'JFK', 13783),
(8, '9E', 'JFK', 14651),
(9, 'DL', 'JFK', 20701),
(10, 'B6', 'JFK', 42076),
(1, 'OO', 'LGA', 26),
(2, 'YV', 'LGA', 601),
(3, 'F9', 'LGA', 685),
(4, '9E', 'LGA', 2541),
(5, 'FL', 'LGA', 3260),
(6, 'B6', 'LGA', 6002),
(7, 'WN', 'LGA', 6087),
(8, 'UA', 'LGA', 8044),
(9, 'EV', 'LGA', 8826),
(10, 'US', 'LGA', 13136),
(11, 'AA', 'LGA', 15459),
(12, 'MQ', 'LGA', 16928),
(13, 'DL', 'LGA', 23067)]
Finally, to find the most popular carriers, we can filter this table
to only the rows where rank=1
. To do this I will use a
subquery :
SELECT * FROM (<another query>)
1 2 3 4 5 6 7 8 cur.execute(""" SELECT * FROM (SELECT RANK() OVER ( PARTITION BY origin ORDER BY n DESC ) AS r, carrier, origin, n FROM (SELECT COUNT() as n, carrier, origin FROM flights GROUP BY carrier, origin) ) WHERE r = 1 """ ).fetchall()
[(1, 'UA', 'EWR', 46087), (1, 'B6', 'JFK', 42076), (1, 'DL', 'LGA', 23067)]
1 2 3 4 5 6 7 8 9 cur.execute(""" SELECT * FROM (SELECT RANK() OVER ( PARTITION BY origin ORDER BY n DESC ) AS r, carrier, origin, n FROM (SELECT COUNT() as n, carrier, origin FROM flights GROUP BY carrier, origin) ) AS subq JOIN airlines ON airlines.carrier = subq.carrier WHERE r = 1 """ ).fetchall()
[(1, 'B6', 'JFK', 42076, 'B6', 'JetBlue Airways'),
(1, 'DL', 'LGA', 23067, 'DL', 'Delta Air Lines Inc.'),
(1, 'UA', 'EWR', 46087, 'UA', 'United Air Lines Inc.')]
What are the busiest travel weeks of the year?
We need to figure out the week of each flight, and then aggregate them. To manipulate dates in sqlite, we have the strftime() function:
1 cur.execute("""SELECT strftime('%W', 'now')""" ).fetchone()
('11',)
1 cur.execute("""SELECT strftime('The day of the week is %w, the week of the year is %W', 'now')""" ).fetchone()
('The day of the week is 0, the week of the year is 11',)
The Unix epoch
The time_hour column in flights looks like:
1 cur.execute("""SELECT time_hour FROM flights""" ).fetchone()
(1357034400.0,)
This number represents the number of seconds that have elapsed since
midnight GMT on January 1, 1970, a date known as the Unix epoch . We can
use strftime()
on these dates provided we tell it how they
are formatted.
1 2 cur.execute("""SELECT datetime(1357034400, 'unixepoch')""" ).fetchone()
('2013-01-01 10:00:00',)
Now we can convert each date to a week of the year:
1 pd.read_sql_query("""SELECT year, month, day, strftime('%W', time_hour, 'unixepoch') from flights""" , con)
        year  month  day  strftime('%W', time_hour, 'unixepoch')
0       2013      1    1                                       00
1       2013      1    1                                       00
2       2013      1    1                                       00
3       2013      1    1                                       00
4       2013      1    1                                       00
...      ...    ...  ...                                      ...
336771  2013      9   30                                       39
336772  2013      9   30                                       39
336773  2013      9   30                                       39
336774  2013      9   30                                       39
336775  2013      9   30                                       39

336776 rows × 4 columns
The final query looks like:
1 2 3 4 5 con.execute(""" SELECT COUNT() as n, strftime('%W', time_hour, 'unixepoch') as week FROM flights GROUP BY week ORDER BY n DESC """ ).fetchall()[:10 ]
[(6760, '27'),
(6740, '29'),
(6737, '28'),
(6735, '30'),
(6733, '31'),
(6702, '25'),
(6687, '32'),
(6675, '24'),
(6639, '23'),
(6631, '33')]
What were the ten worst days for average departure delay?
con.execute('''SELECT year, month, day, AVG(dep_delay) AS avg_delay
FROM flights
GROUP BY year, month, day
ORDER BY avg_delay DESC LIMIT 10''').fetchall()
[(2013, 3, 8, 83.5369211514393),
(2013, 7, 1, 56.23382519863791),
(2013, 9, 2, 53.029550827423165),
(2013, 7, 10, 52.860702151755376),
(2013, 12, 5, 52.32799013563502),
(2013, 5, 23, 51.14471968709257),
(2013, 9, 12, 49.95875),
(2013, 6, 28, 48.827784156142364),
(2013, 6, 24, 47.15741789354473),
(2013, 7, 22, 46.66704675028506)]
What were the ten best days for departure delay?
con.execute('''SELECT year, month, day, AVG(dep_delay) AS avg_delay
FROM flights
GROUP BY year, month, day
ORDER BY avg_delay ASC LIMIT 10''').fetchall()
[(2013, 9, 24, -1.3298319327731092),
(2013, 10, 2, -0.919917864476386),
(2013, 11, 9, -0.6288515406162465),
(2013, 9, 6, -0.39792746113989635),
(2013, 9, 5, -0.3877973112719752),
(2013, 10, 29, -0.34932221063607927),
(2013, 9, 7, -0.25036390101892286),
(2013, 11, 5, -0.18314255983350677),
(2013, 10, 19, -0.10703812316715543),
(2013, 10, 1, -0.09895833333333333)]
Which airlines have the most departure delay?
con.execute('''SELECT airlines.carrier, name, AVG(dep_delay) AS avg_delay
FROM flights
JOIN airlines ON airlines.carrier = flights.carrier
GROUP BY airlines.name
ORDER BY avg_delay DESC LIMIT 10''').fetchall()
[('F9', 'Frontier Airlines Inc.', 20.215542521994134),
('EV', 'ExpressJet Airlines Inc.', 19.955389827868213),
('YV', 'Mesa Airlines Inc.', 18.996330275229358),
('FL', 'AirTran Airways Corporation', 18.72607467838092),
('WN', 'Southwest Airlines Co.', 17.71174377224199),
('9E', 'Endeavor Air Inc.', 16.725769407441433),
('B6', 'JetBlue Airways', 13.022522106740018),
('VX', 'Virgin America', 12.869421165464821),
('OO', 'SkyWest Airlines Inc.', 12.586206896551724),
('UA', 'United Air Lines Inc.', 12.106072888459614)]
What airplane flew the most days?
Airplanes are identified by their tail number.
cur.execute('''SELECT tailnum, COUNT() AS n
FROM (SELECT DISTINCT year, month, day, tailnum FROM flights)
GROUP BY tailnum
ORDER BY n DESC LIMIT 1''').fetchall()
[('N328AA', 336)]
Interestingly, you can Google most tail numbers: N328AA .
How many flights were
cancelled?
We'll say that a cancelled flight has dep_time
missing.
cur.execute('''SELECT COUNT() FROM flights WHERE dep_time IS NULL''').fetchone()
(0,)
Notice that we don't write = NULL
. That does not
work:
cur.execute('''SELECT COUNT() FROM flights WHERE dep_time = NULL''').fetchone()
(0,)
How many flights were destined for airports with a one-word name?
(Not including the word "airport")
Here is an example of an airport with a one-word name:
1 cur.execute("SELECT * FROM airports WHERE name = 'Packwood'" ).description
(('faa', None, None, None, None, None, None),
('name', None, None, None, None, None, None),
('lat', None, None, None, None, None, None),
('lon', None, None, None, None, None, None),
('alt', None, None, None, None, None, None),
('tz', None, None, None, None, None, None),
('dst', None, None, None, None, None, None),
('tzone', None, None, None, None, None, None))
CAST(xxx AS INTEGER)
cur.execute("""
SELECT name FROM airports WHERE name NOT LIKE '% %'
""").fetchall()
[('Packwood',),
('Florence',),
('Arcata',),
('Kodiak',),
('Addison',),
('Alexandria',),
('Centennial',),
('Lakeland',),
('Appleton',),
('Executive',),
('Avalon',),
('Kalamazoo',),
('Bethel',),
('Beaufort',),
('Bettles',),
('KBWD',),
('CedarKey',),
('Clemson',),
('Charlottesville-Albemarle',),
('Chehalis-Centralia',),
('Champaign',),
('Yeager',),
('Tri-Cities',),
('Decatur',),
('Dillingham',),
('Dupage',),
('Destin',),
('Unalaska',),
('Orlando',),
('Bessemer',),
('Wendover',),
('Franklin',),
('Tipton',),
('Gulkana',),
('Gulfport-Biloxi',),
('Majors',),
('Dillingham',),
('Hana',),
('Homer',),
('Iliamna',),
('Immokalee',),
('Kapalua',),
('Logan-Cache',),
('Lihue',),
('Lincoln',),
('Lanai',),
('Castle',),
('Mojave',),
('Downtown',),
('Molokai',),
('Manassas',),
('MariposaYosemite',),
('Lakefront',),
('County',),
('Kahului',),
('Nome',),
('Executive',),
('Northway',),
('Platinum',),
('Renton',),
('Rochester',),
('Prosser',),
('Deadhorse',),
('Sedona',),
('Gillespie',),
('Teterboro',),
('Telluride',),
('Talkeetna',),
('Toledo',),
('Upolu',),
('Apopka',),
('Yakutat',),
('Clear',)]
cur.execute("""
SELECT COUNT() FROM flights
INNER JOIN (SELECT faa FROM airports WHERE name NOT LIKE '% %') one_word
ON one_word.faa = flights.dest
""").fetchall()
[(190,)]