Programming in Python (2)

Numpy

  • Built on Numpy:
    • SciPy
    • Pandas
    • matplotlib
    • Tensorflow
    • Scikit Learn
    • PyTorch
    • JAX
  • Numpy quick start guide: https://numpy.org/doc/stable/user/quickstart.html
  • closely related package scipy is for opimization
  • Basic numerical data types:
    • bool
    • int
    • uint
    • float
    • complex
    • https://docs.scipy.org/doc/numpy-1.10.1/user/basics.types.html

numpy.array

  • https://docs.scipy.org/doc/numpy-1.10.1/user/basics.creation.html
  • compared to list:
    • faster
    • operation easy
1
2
3
4
import numpy as np

np.array([1,2,3],dtype='uint')

array([1, 2, 3], dtype=uint32)
1
np.zeros((2,3))
array([[0., 0., 0.],
       [0., 0., 0.]])
1
np.arange(2,3,0.1,dtype='float')
array([2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9])

小心大整数会导致数据溢出,不同机器结果不一样

1
2
3
import numpy as np
a = np.arange(100_000_000,dtype='int64')
np.sum(a)
4999999950000000

empty 有时候会有随机结果,容易出错

1
np.empty((2,3))
array([[0., 0., 0.],
       [0., 0., 0.]])

arange()

1
np.arange(27).reshape(3,3,3)
array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])
1
2
a = np.arange(27).reshape(3,3,3)
a[0]
array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])
1
np.arange(5,10)
array([5, 6, 7, 8, 9])
1
np.arange(0,1,0.1)
array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

array indexing

1
2
x=np.arange(10)
x
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
1
x[:-2]
array([0, 1, 2, 3, 4, 5, 6, 7])
1
x[1:7:2]
array([1, 3, 5])
1
x[::2]
array([0, 2, 4, 6, 8])
1
2
x = np.reshape(np.arange(1,13),(3,4))
x
array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])
1
x[1]
array([5, 6, 7, 8])
1
x[:,(1,3)]
array([[ 2,  4],
       [ 6,  8],
       [10, 12]])
1
x[(0,2),(1,3)] # 表示(0,1)和(2,3),第二项表示第几列,第一项表示第几行
array([ 2, 12])
1
x[x>7]
array([ 8,  9, 10, 11, 12])
1
x[(True,False,False)]
array([], shape=(0, 2, 5), dtype=int32)
  • index 组合可以用np.ix_
1
2
3
x = np.arange(9).reshape(3,3)
print(x)
x[np.ix_([0,2],[0,1])]
[[0 1 2]
 [3 4 5]
 [6 7 8]]





array([[0, 1],
       [6, 7]])

Example: minist

1
2
import numpy as np
minist = np.load('8/mnist.npz')
1
minist['images']
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)
1
list(minist) # get key
['images', 'labels']
1
minist['images'].ndim
2
1
minist['images'].shape
(60000, 784)
1
2
img = minist['images']
img[0].shape # 一维
(784,)
1
2
3
4
5
img_0 = img[0].reshape(28,28)

%matplotlib inline
import matplotlib.pyplot as plt
plt.imshow(img_0,cmap="Greys")
<matplotlib.image.AxesImage at 0x2d675185120>


png

mnist中的数据按照uint8存储,所以加和是错误的

1
np.uint8(255).astype(int).dtype
dtype('int32')

numpy/scipy universal functions(ufuncs)

  • support vectorized operations
  • np.min快于min,max
1
sum(img.reshape(len(img),28,28)).shape
(28, 28)
1
np.mean(img)
33.318421449829934
1
img.reshape(-1).shape
(47040000,)

按照维度相加

1
plt.imshow(img[minist['labels']==8].mean(axis=0).reshape(28,28))
<matplotlib.image.AxesImage at 0x2d675301ab0>


png

Numpy example

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

salaries_full = pd.read_csv(
"9/salaries_clean.csv", encoding="latin1", index_col="salary_id"
)
columns = [
"location_latitude",
"location_longitude",
"total_experience_years",
"employer_experience_years",
"annual_base_pay",
"signing_bonus",
]
salary_data = salaries_full[columns].to_numpy()
salary_data
array([[ 3.7770e+01, -1.2241e+02,  1.3000e+01,  2.0000e+00,  1.2500e+05,
         5.0000e+03],
       [ 3.6360e+01, -9.4200e+01,  1.5000e+01,  8.0000e+00,  6.5000e+04,
                nan],
       [ 4.1470e+01, -8.1670e+01,  4.0000e+00,  1.0000e+00,  8.6000e+04,
         5.0000e+03],
       ...,
       [        nan,         nan,  7.0000e+00,  1.0000e+00,  1.5000e+05,
         0.0000e+00],
       [ 3.8990e+01, -7.6930e+01,  5.0000e+00,  1.0000e+00,  7.5000e+04,
                nan],
       [        nan,         nan,  2.0000e+00,  0.0000e+00,  5.0000e+01,
         0.0000e+00]])

Numpy indexing conventirons:

  • outermost,innermost
1
salary_data[:,0].shape
(1655,)
1
2
3
4
col = np.array(columns)
col=='annual_base_pay'
annual_base_pay = salary_data[:,col=='annual_base_pay']
np.mean(annual_base_pay)
nan
  • index 可以boolean
1
np.nanmean(annual_base_pay)
6325003.746965475
1
np.isnan(annual_base_pay)
array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [False]])
1
annual_base_pay.reshape(-1)
array([1.25e+05, 6.50e+04, 8.60e+04, ..., 1.50e+05, 7.50e+04, 5.00e+01])
1
annual_base_pay[np.isnan(annual_base_pay)]
array([nan, nan, nan, nan])
  • 取反
1
~np.isnan(annual_base_pay)
array([[ True],
       [ True],
       [ True],
       ...,
       [ True],
       [ True],
       [ True]])
1
2
3
np.all(~np.isnan(salary_data),axis=1)
np.any(np.isnan(salary_data),axis=1)

array([False,  True, False, ...,  True,  True,  True])
1
salary_data[~np.any(np.isnan(salary_data),axis=1)]
array([[ 3.7770e+01, -1.2241e+02,  1.3000e+01,  2.0000e+00,  1.2500e+05,
         5.0000e+03],
       [ 4.1470e+01, -8.1670e+01,  4.0000e+00,  1.0000e+00,  8.6000e+04,
         5.0000e+03],
       [ 3.8000e+01, -9.7000e+01,  5.0000e+00,  1.5000e+00,  4.0000e+04,
         0.0000e+00],
       ...,
       [ 4.0030e+01, -7.5630e+01,  2.0000e+00,  1.0000e+00,  6.0000e+04,
         0.0000e+00],
       [ 3.9950e+01, -7.5160e+01,  6.0000e+00,  2.5000e+00,  1.1100e+05,
         5.0000e+03],
       [ 4.0020e+01, -1.0525e+02,  1.0000e+00,  5.0000e-01,  6.0500e+04,
         0.0000e+00]])
  • last 2 rows
1
salary_data[-2:]
array([[ 3.899e+01, -7.693e+01,  5.000e+00,  1.000e+00,  7.500e+04,
               nan],
       [       nan,        nan,  2.000e+00,  0.000e+00,  5.000e+01,
         0.000e+00]])
  • list 是行的index
1
salary_data[[1,2,3]]
array([[ 3.636e+01, -9.420e+01,  1.500e+01,  8.000e+00,  6.500e+04,
               nan],
       [ 4.147e+01, -8.167e+01,  4.000e+00,  1.000e+00,  8.600e+04,
         5.000e+03],
       [       nan,        nan,  4.000e+00,  0.000e+00,  1.050e+05,
         5.000e+03]])
  • tuple 是坐标
1
2
# salary_data[(1,2,3)] # Error: () 代表坐标
salary_data[(1,2)]
15.0

Normalizing Data

1
col_mean = np.nanmean(salary_data,axis=0)
array([ 3.77543939e+01, -6.47122727e+01,  6.75611318e+00,  2.65616294e+00,
        6.32500375e+06,  2.24056269e+04])
  • Broad casting

Note: 如果矩阵很大,可能会溢出,这样就得用for loop来写

Broad cast的矩阵,要么某些维度数量match,要么就是1或没有

  • \(5\times4\) ; 1
  • \(5\times4\) ; 4
  • 但是\(5\times4\)如果要每行减去一列数字,不能是一个长度为5的array,因为匹配的是innermost index,应该reshape(1,5) 或者 [:,None]即最后加一个维度=1
1
2
3
salary_data.mean(1)
salary_data.mean(axis=1) # mean of each row

array([21655.06,      nan, 15160.8 , ...,      nan,      nan,      nan])
1
2
3
row_mu = salary_data.mean(axis=1)
row_mu.shape,row_mu[:,None].shape,row_mu[:,None,None].shape

((1655,), (1655, 1), (1655, 1, 1))
1
salary_data-row_mu[:,None]
array([[-21617.29, -21777.47, -21642.06, -21653.06, 103344.94, -16655.06],
       [      nan,       nan,       nan,       nan,       nan,       nan],
       [-15119.33, -15242.47, -15156.8 , -15159.8 ,  70839.2 , -10160.8 ],
       ...,
       [      nan,       nan,       nan,       nan,       nan,       nan],
       [      nan,       nan,       nan,       nan,       nan,       nan],
       [      nan,       nan,       nan,       nan,       nan,       nan]])
1
np.array(1).shape,np.array(1)[None].shape
((), (1,))
1
2
row_mu = salary_data.mean(axis=1,keepdims=True)
row_mu.shape
(1655, 1)
  • Reductions and axis=
    • that axis will disapear
1
salary_data.mean(axis=1)
array([21655.06,      nan, 15160.8 , ...,      nan,      nan,      nan])

for loops too slow

1
2
prices = np.array([20,18,21,14])
prices - np.minimum.accumulate(prices)
array([0, 0, 3, 0])
1
2
3
4
5
6
7
n = 10
p = 2
k = 3

x = np.random.rand(n,p)
c = np.random.rand(k,p)
x.shape
(10, 2)
1
x
array([[0.57019677, 0.43860151],
       [0.98837384, 0.10204481],
       [0.20887676, 0.16130952],
       [0.65310833, 0.2532916 ],
       [0.46631077, 0.24442559],
       [0.15896958, 0.11037514],
       [0.65632959, 0.13818295],
       [0.19658236, 0.36872517],
       [0.82099323, 0.09710128],
       [0.83794491, 0.09609841]])
1
np.random.rand(10,2)
array([[0.58651293, 0.02010755],
       [0.82894003, 0.00469548],
       [0.67781654, 0.27000797],
       [0.73519402, 0.96218855],
       [0.24875314, 0.57615733],
       [0.59204193, 0.57225191],
       [0.22308163, 0.95274901],
       [0.44712538, 0.84640867],
       [0.69947928, 0.29743695],
       [0.81379782, 0.39650574]])
  • (n,1,p)-(1,k,p)是一个(n,k,p)的array
1
2
D = x[:,None,:]-c[None,:]
D[0]
array([[-0.40626269, -0.03004969],
       [-0.40656432, -0.16624401],
       [-0.16906681,  0.39941372]])
1
D.shape
(10, 3, 2)
  • 第三个维度被reduction
1
x_c_norm = np.linalg.norm(D,axis=2)
1
np.argmin(x_c_norm,axis=1)
array([0, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)
1
2
3
4
5
6
import numpy as np

k=3
a= np.array([0,1,2,3,4,5])
ans = a[:,None]+a[None,:]
ans[:len(a)-k+1,:k]
array([[0, 1, 2],
       [1, 2, 3],
       [2, 3, 4],
       [3, 4, 5]])
1
2
3
4
5
import numpy as np

x = np.arange(10)
x[np.logical_and(x>3,x<7)]

array([4, 5, 6])
1
x[np.logical_or(x>3,x<7)]
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
1
2
np.random.random((2,3))

array([[0.8754406 , 0.31234192, 0.21472602],
       [0.1417056 , 0.6179606 , 0.0532995 ]])
1
2
np.random.normal(0,1,20)

array([ 1.36952814,  0.39363222,  0.48644709, -0.23743297,  0.21279548,
       -0.5515374 , -2.22034906, -1.41651274,  0.08532262,  0.49804278,
       -2.03253663,  0.66368554,  0.28155033,  0.94521731,  0.5938768 ,
       -1.57368613, -1.55284648,  0.54146951, -0.75257585, -0.23976487])
1
np.random.uniform(0,1,(2,4))
array([[0.5909873 , 0.8235658 , 0.14703172, 0.94326283],
       [0.79080321, 0.34115745, 0.75083531, 0.31725324]])
1
2
3
4
5
x=np.arange(1,11)
for i in range(5):
print(np.random.choice(x,5,False,x/float(sum(x))))


[ 6 10  7  9  5]
[ 9  8 10  7  6]
[ 2  8  9 10  5]
[10  9  8  3  6]
[7 8 6 5 9]
1
2
3
x = np.arange(10)
np.random.shuffle(x)
x
array([1, 7, 4, 0, 8, 3, 5, 2, 6, 9])
1
np.random.permutation(x) #x not change
array([1, 9, 5, 7, 3, 4, 2, 0, 6, 8])
1
x
array([1, 7, 4, 0, 8, 3, 5, 2, 6, 9])
1
np.sort(x)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

Pandas

basic structures

  • Series
    • One dimensional labeled array
  • DataFrame
    • tabel of rows, with labeled columns
1
2
3
4
import numpy as np

a= np.array([0,1,2,3,4,5])
(a[:,None]+a[None,:])[:4,:3]
array([[0, 1, 2],
       [1, 2, 3],
       [2, 3, 4],
       [3, 4, 5]])
1
2
import pandas as pd
flight = pd.read_csv("10/flights.csv.gz")

Series

  • can create a pandas series from any array-like structure(dict,numpy array,list)
  • 和numpy不同的是:有label
    • 可以自己改label
1
2
3
4
import numpy as np
numbers = np.random.randn(5)
s = pd.Series(numbers)
s
0    1.556860
1    0.187127
2   -0.245997
3    1.342677
4   -0.117419
dtype: float64
1
2
3
idx = ['a','b','c','d','e']
s = pd.Series(numbers,index=idx)
s
a    1.556860
b    0.187127
c   -0.245997
d    1.342677
e   -0.117419
dtype: float64

providing too few or too many indices is a ValueError

1
pd.Series(np.random.rand(5),index=['a','b','c','d','e'])
a    0.996875
b    0.893754
c    0.925473
d    0.853753
e    0.500225
dtype: float64

can create a Series from a dict

1
2
d = {"dog":1,"cat":2}
pd.Series(d)
dog    1
cat    2
dtype: int64
1
pd.Series(d,index=["dog","cat","pig"])
dog    1.0
cat    2.0
pig    NaN
dtype: float64

字典里没有pig,所以这里赋值给pig:NaN

inf==inf+X

np.nan不等于np.nan

用np.isnan()判断

或者用变量不等于自身判断nan,这是一个变量不等于自身的特例

1
s[0]
1.5568595147700188
1
s[-1]
-0.11741940757185318
1
s[:3]
a    1.556860
b    0.187127
c   -0.245997
dtype: float64
1
s['a']
1.5568595147700188

这里需要注意的是:用:以及index名字分割数组时,:后面的也会包含进去。但是数字不会,哪怕索引名字是数字。

1
s['a':'c']
a    1.556860
b    0.187127
c   -0.245997
dtype: float64
1
s[0:3]
a    1.556860
b    0.187127
c   -0.245997
dtype: float64
1
np.exp(s[:3])
a    4.743900
b    1.205781
c    0.781924
dtype: float64
  • indices need not be unique
1
2
s = pd.Series([2,3,5,7,11],index=['a','a','a','c','d'])
s
a     2
a     3
a     5
c     7
d    11
dtype: int64
1
s['a']
a    2
a    3
a    5
dtype: int64
1
s[0]
2
1
s['a':'c']
a    2
a    3
a    5
c    7
dtype: int64
1
s[0:3]
a    2
a    3
a    5
dtype: int64

和np一样,可以用boolean series索引

1
s[s>5]
c     7
d    11
dtype: int64
1
s>5
a    False
a    False
a    False
c     True
d     True
dtype: bool

Series objects are dict-like, in that we can access and update entries via their keys.

Like dictionary, accessing a non-existent key is a KeyError.

Series also support in operator.

also support get method in dictionary

entries in Series can be mix type, but no sequence

1
2
s.name = "random int"
s
a     2
a     3
a     5
c     7
d    11
Name: random int, dtype: int64
1
2
s = pd.Series(['dog','fish','elephant'])
s.map(lambda s:len(s))
0    3
1    4
2    8
dtype: int64
1
s.map(len)
0    3
1    4
2    8
dtype: int64

map in Series allows to change values based on another Series.

1
2
s = pd.Series(['fruit','animal','animal','fruit','fruit'],index=['apple','cat','goat','banana','kiwi'])
s
apple      fruit
cat       animal
goat      animal
banana     fruit
kiwi       fruit
dtype: object
1
2
t = pd.Series({'fruit':0,'animal':1})
s.map(t)
apple     0
cat       1
goat      1
banana    0
kiwi      0
dtype: int64

DataFrame

  • each column can have its own index. DataFrame will have all the index. 缺失值NaN
1
2
3
d = {'A':pd.Series([1,2,3],index={'cat','dog','bird'}),'B':{'dog':1,'cat':2,'bird':3,'goat':4}}
df = pd.DataFrame(d)
df
A B
bird 2.0 3
cat 3.0 2
dog 1.0 1
goat NaN 4
1
df['A']
bird    2.0
cat     3.0
dog     1.0
goat    NaN
Name: A, dtype: float64

By default: rows and columns are ordered alphabetically

1
df.columns
Index(['A', 'B'], dtype='object')
1
df.index
Index(['bird', 'cat', 'dog', 'goat'], dtype='object')
1
df['C']=df['A']*2
1
df
A B C
bird 2.0 3 4.0
cat 3.0 2 6.0
dog 1.0 1 2.0
goat NaN 4 NaN
1
flight['max_delay']=list(map(max,flight['arr_delay'],flight['dep_delay']))
1
2
v = np.array([1,2,3,4,5])
np.maximum(v,v*2)
array([ 2,  4,  6,  8, 10])
1
np.max(v)
5
1
flight['max_delay'] = np.maximum(flight['arr_delay'],flight['dep_delay'])
1
2
flight['num']=0 # Broadcast
flight['num']
0         0
1         0
2         0
3         0
4         0
         ..
336771    0
336772    0
336773    0
336774    0
336775    0
Name: num, Length: 336776, dtype: int64
1
2
del flight['num']
flight.columns
Index(['year', 'month', 'day', 'dep_time', 'sched_dep_time', 'dep_delay',
       'arr_time', 'sched_arr_time', 'arr_delay', 'carrier', 'flight',
       'tailnum', 'origin', 'dest', 'air_time', 'distance', 'hour', 'minute',
       'time_hour', 'max_delay'],
      dtype='object')
1
df.loc['cat'] # select rows by labels
A    3.0
B    2.0
Name: cat, dtype: float64
1
df.iloc[1] # select rows by their integer labels(starting from 0)
A    3.0
B    2.0
Name: cat, dtype: float64
1
df[1:3] # rows
A B
cat 3.0 2
dog 1.0 1
1
df[df['A']>2]
A B
cat 3.0 2
1
df.iloc[:2,:1] # 这里无法去掉iloc
A
bird 2.0
cat 3.0
1
df.loc['cat':'dog','A']
cat    3.0
dog    1.0
Name: A, dtype: float64
1
2
3
df1 = pd.DataFrame(np.random.randn(8,4),columns=['a','b','c','d'])
df2 = pd.DataFrame(np.random.randn(5,3),columns=['a','b','c'])
df1+df2
a b c d
0 0.150499 -0.084967 -1.304810 NaN
1 -1.723753 0.809527 -3.027434 NaN
2 -1.770957 -1.115960 -0.606153 NaN
3 -1.322787 -0.154228 0.316819 NaN
4 0.357720 2.129364 2.252070 NaN
5 NaN NaN NaN NaN
6 NaN NaN NaN NaN
7 NaN NaN NaN NaN
1
df2-df2.iloc[0] # Broadcast
a b c
0 0.000000 0.000000 0.000000
1 -0.406540 -0.015429 -0.683493
2 -0.095646 -1.092577 1.542312
3 -0.462943 -0.349725 1.856521
4 0.749604 1.157111 1.999972
1
(df1>0.5).any()
a    True
b    True
c    True
d    True
dtype: bool
1
(df1>0.5).all()
a    False
b    False
c    False
d    False
dtype: bool
1
df1.T*df1.T
0 1 2 3 4 5 6 7
a 0.039975 1.607250 2.643456 0.656756 0.117268 0.264463 0.471706 1.462087
b 0.012506 0.636951 0.002525 0.028437 0.893762 2.970262 0.084393 3.932983
c 0.002420 1.184460 0.797188 0.080707 2.273190 0.834139 2.697453 0.488765
d 0.650234 0.199167 0.196074 0.403005 0.352569 1.245693 0.628392 0.021431
1
df1.T.dot(df1) # numpy-like
a b c d
a 7.262961 -4.896682 2.348484 -2.077213
b -4.896682 8.561819 3.042636 2.741910
c 2.348484 3.042636 8.358324 -0.150733
d -2.077213 2.741910 -0.150733 3.696565
1
df1.T@df1
a b c d
a 7.262961 -4.896682 2.348484 -2.077213
b -4.896682 8.561819 3.042636 2.741910
c 2.348484 3.042636 8.358324 -0.150733
d -2.077213 2.741910 -0.150733 3.696565
1
df.dropna(axis=1,how='any') # drop columns
B
bird 3
cat 2
dog 1
goat 4
1
df.dropna() # drop rows
A B
bird 2.0 3
cat 3.0 2
dog 1.0 1
1
df.dropna(axis=1,how="all") # drop columns with all missing data
A B
bird 2.0 3
cat 3.0 2
dog 1.0 1
goat NaN 4

pandas supports read/write for a wide range of different file format

1
flight.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 20 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   year            336776 non-null  int64  
 1   month           336776 non-null  int64  
 2   day             336776 non-null  int64  
 3   dep_time        328521 non-null  float64
 4   sched_dep_time  336776 non-null  int64  
 5   dep_delay       328521 non-null  float64
 6   arr_time        328063 non-null  float64
 7   sched_arr_time  336776 non-null  int64  
 8   arr_delay       327346 non-null  float64
 9   carrier         336776 non-null  object 
 10  flight          336776 non-null  int64  
 11  tailnum         334264 non-null  object 
 12  origin          336776 non-null  object 
 13  dest            336776 non-null  object 
 14  air_time        327346 non-null  float64
 15  distance        336776 non-null  int64  
 16  hour            336776 non-null  int64  
 17  minute          336776 non-null  int64  
 18  time_hour       336776 non-null  object 
 19  max_delay       327346 non-null  float64
dtypes: float64(6), int64(9), object(5)
memory usage: 51.4+ MB
1
flight.head()
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute time_hour max_delay
0 2013 1 1 517.0 515 2.0 830.0 819 11.0 UA 1545 N14228 EWR IAH 227.0 1400 5 15 2013-01-01T10:00:00Z 11.0
1 2013 1 1 533.0 529 4.0 850.0 830 20.0 UA 1714 N24211 LGA IAH 227.0 1416 5 29 2013-01-01T10:00:00Z 20.0
2 2013 1 1 542.0 540 2.0 923.0 850 33.0 AA 1141 N619AA JFK MIA 160.0 1089 5 40 2013-01-01T10:00:00Z 33.0
3 2013 1 1 544.0 545 -1.0 1004.0 1022 -18.0 B6 725 N804JB JFK BQN 183.0 1576 5 45 2013-01-01T10:00:00Z -1.0
4 2013 1 1 554.0 600 -6.0 812.0 837 -25.0 DL 461 N668DN LGA ATL 116.0 762 6 0 2013-01-01T11:00:00Z -6.0
1
flight.tail()
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute time_hour max_delay
336771 2013 9 30 NaN 1455 NaN NaN 1634 NaN 9E 3393 NaN JFK DCA NaN 213 14 55 2013-09-30T18:00:00Z NaN
336772 2013 9 30 NaN 2200 NaN NaN 2312 NaN 9E 3525 NaN LGA SYR NaN 198 22 0 2013-10-01T02:00:00Z NaN
336773 2013 9 30 NaN 1210 NaN NaN 1330 NaN MQ 3461 N535MQ LGA BNA NaN 764 12 10 2013-09-30T16:00:00Z NaN
336774 2013 9 30 NaN 1159 NaN NaN 1344 NaN MQ 3572 N511MQ LGA CLE NaN 419 11 59 2013-09-30T15:00:00Z NaN
336775 2013 9 30 NaN 840 NaN NaN 1020 NaN MQ 3531 N839MQ LGA RDU NaN 431 8 40 2013-09-30T12:00:00Z NaN
1
2
3
df1 = df*2
df2 = df+df
(df1==df2).all() # 有NaN,判断相等时要小心
A    False
B     True
dtype: bool
1
np.nan==np.nan
False
1
df1.equals(df2)
True
1
df1.mean(0) # mean of colums
A    4.0
B    5.0
dtype: float64
1
df1.mean(1) # mean of rows
bird    5.0
cat     5.0
dog     2.0
goat    8.0
dtype: float64
1
np.mean(df1) # skip na, 也可以设置skipna=False
A    4.0
B    5.0
dtype: float64

np.mean作用于dataframe时,自动跳过nan,但是on np.array时,返回nan

1
flight.describe()
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay flight air_time distance hour minute max_delay
count 336776.0 336776.000000 336776.000000 328521.000000 336776.000000 328521.000000 328063.000000 336776.000000 327346.000000 336776.000000 327346.000000 336776.000000 336776.000000 336776.000000 327346.000000
mean 2013.0 6.548510 15.710787 1349.109947 1344.254840 12.639070 1502.054999 1536.380220 6.895377 1971.923620 150.686460 1039.912604 13.180247 26.230100 16.969943
std 0.0 3.414457 8.768607 488.281791 467.335756 40.210061 533.264132 497.457142 44.633292 1632.471938 93.688305 733.233033 4.661316 19.300846 42.376455
min 2013.0 1.000000 1.000000 1.000000 106.000000 -43.000000 1.000000 1.000000 -86.000000 1.000000 20.000000 17.000000 1.000000 0.000000 -33.000000
25% 2013.0 4.000000 8.000000 907.000000 906.000000 -5.000000 1104.000000 1124.000000 -17.000000 553.000000 82.000000 502.000000 9.000000 8.000000 -4.000000
50% 2013.0 7.000000 16.000000 1401.000000 1359.000000 -2.000000 1535.000000 1556.000000 -5.000000 1496.000000 129.000000 872.000000 13.000000 29.000000 1.000000
75% 2013.0 10.000000 23.000000 1744.000000 1729.000000 11.000000 1940.000000 1945.000000 14.000000 3465.000000 192.000000 1389.000000 17.000000 44.000000 19.000000
max 2013.0 12.000000 31.000000 2400.000000 2359.000000 1301.000000 2400.000000 2359.000000 1272.000000 8500.000000 695.000000 4983.000000 23.000000 59.000000 1301.000000

pipe method: method chaining

1
2
3
4
5
f = lambda x : x**2
g = lambda x : x+1
h = lambda x : 2*x
df = pd.DataFrame(np.random.randn(5,3),columns=['A','B','C'])
df
A B C
0 -0.413218 0.348975 -1.313533
1 1.658054 0.495508 2.021334
2 -1.136187 0.002771 -1.651880
3 -1.125284 0.205869 -0.670324
4 1.085758 -0.103062 2.407278
1
h(g(f(df)))
A B C
0 2.341498 2.243567 5.450740
1 7.498284 2.491057 10.171579
2 4.581844 2.000015 7.457418
3 4.532527 2.084764 2.898668
4 4.357740 2.021243 13.589975
1
df.pipe(f).pipe(g).pipe(h)
A B C
0 2.341498 2.243567 5.450740
1 7.498284 2.491057 10.171579
2 4.581844 2.000015 7.457418
3 4.532527 2.084764 2.898668
4 4.357740 2.021243 13.589975

row- and column-wise function: apply()

1
flight.apply()

Pandas II

1
2
import pandas as pd
import numpy as np
1
2
s = pd.Series(np.random.randn(8))
s.pct_change()
0         NaN
1   -3.496694
2   -1.735719
3   -0.905374
4    2.515764
5   -1.755356
6    0.058851
7    2.335997
dtype: float64
1
2
3
s1 = pd.Series(np.random.randn(100))
s2 = pd.Series(0.1*s1+np.random.randn(100))
s1.cov(s2)
0.2999112016156914
1
2
s = pd.Series(np.random.randn(5),index=list('abcde'))
s
a   -0.709341
b    0.346186
c    0.136598
d   -0.826597
e   -1.181088
dtype: float64
1
s.rank()
a    3.0
b    5.0
c    4.0
d    2.0
e    1.0
dtype: float64
1
2
s[0]=s[1]=0
s.rank()
a    3.5
b    3.5
c    5.0
d    2.0
e    1.0
dtype: float64
1
2
tsdf = pd.DataFrame(np.random.randn(10,3),columns=['DOW','NASDAQ','SP500'],index=pd.date_range('1/1/2000',periods=10))
tsdf
DOW NASDAQ SP500
2000-01-01 0.656688 -1.531741 -0.024392
2000-01-02 -0.706209 0.325865 -0.329847
2000-01-03 -0.127025 0.806435 -0.910426
2000-01-04 -0.401866 0.191501 -0.204383
2000-01-05 -1.283493 1.175840 -1.195107
2000-01-06 0.467726 -0.255641 0.056299
2000-01-07 -0.371994 0.140242 -0.650468
2000-01-08 0.818794 0.461664 1.267799
2000-01-09 2.121635 1.405991 -0.179316
2000-01-10 -1.062810 0.308460 1.338484

apply to each column

1
tsdf.agg([np.median,np.mean,np.std])
DOW NASDAQ SP500
median -0.249509 0.317162 -0.191849
mean 0.011145 0.302862 -0.083136
std 1.022643 0.814786 0.829758

agg can take dict as functions

1
2
tsdf.agg({'DOW':'mean','NASDAQ':'median','SP500':'max'})

DOW       0.011145
NASDAQ    0.317162
SP500     1.338484
dtype: float64
1
tsdf.corr(method='spearman')
DOW NASDAQ SP500
DOW 1.000000 -0.018182 0.369697
NASDAQ -0.018182 1.000000 -0.345455
SP500 0.369697 -0.345455 1.000000
1
tsdf.corr(method='pearson')
DOW NASDAQ SP500
DOW 1.000000 -0.012922 0.191341
NASDAQ -0.012922 1.000000 -0.222936
SP500 0.191341 -0.222936 1.000000

dataframe ranks columns(default)

1
tsdf.rank()
DOW NASDAQ SP500
2000-01-01 8.0 1.0 7.0
2000-01-02 3.0 6.0 4.0
2000-01-03 6.0 8.0 2.0
2000-01-04 4.0 4.0 5.0
2000-01-05 1.0 9.0 1.0
2000-01-06 7.0 2.0 8.0
2000-01-07 5.0 3.0 3.0
2000-01-08 9.0 7.0 9.0
2000-01-09 10.0 10.0 6.0
2000-01-10 2.0 5.0 10.0
1
tsdf.rank(1)
DOW NASDAQ SP500
2000-01-01 3.0 1.0 2.0
2000-01-02 1.0 3.0 2.0
2000-01-03 2.0 3.0 1.0
2000-01-04 1.0 3.0 2.0
2000-01-05 1.0 3.0 2.0
2000-01-06 3.0 1.0 2.0
2000-01-07 2.0 3.0 1.0
2000-01-08 2.0 1.0 3.0
2000-01-09 3.0 2.0 1.0
2000-01-10 1.0 2.0 3.0

iterating over Series get an iterator on values

iterating over dataframe get an iterator over column names

df.iteritems
df.iterrows

pandas,Group By

1
2
df = pd.DataFrame({'A':['plant','animal','plant','plant'],'B':['apple','goat','kiwi','grape'],'C':np.random.randn(4),'D':np.random.randn(4)})
df
A B C D
0 plant apple 2.392546 -0.554108
1 animal goat -1.292467 0.800007
2 plant kiwi 0.718886 0.600089
3 plant grape -0.665115 -1.205696
1
df.groupby('A')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000155D469CD00>
1
df.groupby('A').groups
{'animal': [1], 'plant': [0, 2, 3]}
1
df.groupby('A').mean()
C D
A
animal -1.292467 0.800007
plant 0.815439 -0.386571
1
2
3
index = pd.date_range('10/1/1999',periods=1100)
ts = pd.Series(np.random.normal(0.5,2,1100),index)
ts.head()
1999-10-01    1.318674
1999-10-02    2.784437
1999-10-03    4.774021
1999-10-04    0.017314
1999-10-05    2.678237
Freq: D, dtype: float64
1
2
3
4
key = lambda d: d.year
zscore = lambda x: (x-x.mean())/x.std()
transformed = ts.groupby(key).transform(zscore)
transformed.head()
1999-10-01    0.409280
1999-10-02    1.168705
1999-10-03    2.199526
1999-10-04   -0.264966
1999-10-05    1.113682
Freq: D, dtype: float64

filter return ungrouped resulted

1
2
sf = pd.Series([1,1,2,2,3,3])
sf.groupby(sf).filter(lambda x: x.sum()>2)
2    2
3    2
4    3
5    3
dtype: int64

df.groupby().agg()
df.groupby().transform()

df.concat
df.pivot

1
2
3
4
5
6
7
8
9
10
11
np.random.seed(0)

df4 = pd.DataFrame({
"stock":['A', 'B', 'C', 'A', 'B','C','A','C','A','B','C','A','B','C','D','D','D','D','A','B','C','D'],
"date":['20210104','20210104','20210104','20210102','20210102','20210102',
'20210103', '20210103', '20210101','20210101','20210101',
'20210105','20210105','20210105','20210102','20210103','20210104','20210105',
'20210106', '20210106', '20210106','20210106'],
"daily_ret":np.random.normal(0,0.05,22)})
df4_stacked = df4.pivot(index='date',columns='stock',values='daily_ret')
df4_stacked
stock A B C D
date
20210101 -0.005161 0.020530 0.007202 NaN
20210102 0.112045 0.093378 -0.048864 0.022193
20210103 0.047504 NaN -0.007568 0.016684
20210104 0.088203 0.020008 0.048937 0.074704
20210105 0.072714 0.038052 0.006084 -0.010258
20210106 0.015653 -0.042705 -0.127649 0.032681
1
2
s = df4_stacked.stack()
s
date      stock
20210101  A       -0.005161
          B        0.020530
          C        0.007202
20210102  A        0.112045
          B        0.093378
          C       -0.048864
          D        0.022193
20210103  A        0.047504
          C       -0.007568
          D        0.016684
20210104  A        0.088203
          B        0.020008
          C        0.048937
          D        0.074704
20210105  A        0.072714
          B        0.038052
          C        0.006084
          D       -0.010258
20210106  A        0.015653
          B       -0.042705
          C       -0.127649
          D        0.032681
dtype: float64
1
2
s['20210104','A']
s['20210104']['A']
0.0882026172983832

matplotlib

1
2
3
4
import matplotlib.pyplot as plt
%matplotlib inline
x = np.arange(10)
_ = plt.plot(x**2)


png

1
2
import matplotlib as mpl
mpl.style.available
['Solarize_Light2',
 '_classic_test_patch',
 'bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn',
 'seaborn-bright',
 'seaborn-colorblind',
 'seaborn-dark',
 'seaborn-dark-palette',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-notebook',
 'seaborn-paper',
 'seaborn-pastel',
 'seaborn-poster',
 'seaborn-talk',
 'seaborn-ticks',
 'seaborn-white',
 'seaborn-whitegrid',
 'tableau-colorblind10']
1
2
mpl.style.use('ggplot')
mpl.style.use('seaborn-white')
1
2
3
%matplotlib inline
x = np.arange(10)
_ = plt.plot(x**2)


png

1
2
3
t = np.arange(10)
s = np.random.normal(1,1,10)
_ = plt.bar(t,s,align='center')


png

1
2
3
fig,axs = plt.subplots(nrows=3,ncols=3)
x = np.arange(0,100)
axs[0][0].plot(x,x**2)
[<matplotlib.lines.Line2D at 0x155d7666eb0>]


png

Regular Expressions and Text Encoding

Unicode ASCII sebset of UTF-8

  • re.match() at start of string
  • re.search() any part of string
  • re.findall() find all matches of parttern in the string
  • https://rex101.com
1
2
import re
re.match('cat','cats and dogs')
<re.Match object; span=(0, 3), match='cat'>
1
2
regex = 'cat'
re.match(regex,'fat on mat') is None
True
1
2
3
4
if re.match(regex,'fat on mat'):
print('matches')
else:
print('Not')
Not
1
re.search('cat','rains cats and dogs')
<re.Match object; span=(6, 9), match='cat'>
1
2
if re.search('cat','rains cats and dogs'):
print('matches')
matches
1
re.findall('cat','rains cats and cat')
['cat', 'cat']
1
re.match('\\\\\$','$')
1
re.match('\\\\\$','\$')
<re.Match object; span=(0, 2), match='\\$'>
1
re.match('\$','$')
<re.Match object; span=(0, 1), match='$'>
  • raw text
    • is new line
    • r"" == "\n", raw text
  • 推荐在用字符串的时候都加上r
1
print(r"\n1.00","\n1.00")
\n1.00 
1.00
1
re.match('$','$')
1
re.match('\$','$')
<re.Match object; span=(0, 1), match='$'>
1
re.match(r'\$','$')
<re.Match object; span=(0, 1), match='$'>
1
re.findall("[aeiou]","cats and dog")
['a', 'a', 'o']
1
re.findall("[a-z ]","cats and Dog")
['c', 'a', 't', 's', ' ', 'a', 'n', 'd', ' ', 'o', 'g']
1
re.findall("[a-z]","cats and Dog")
['c', 'a', 't', 's', 'a', 'n', 'd', 'o', 'g']
1
re.findall("[A-Z]","cats and Dog")
['D']
1
re.findall("[a-zA-Z]","cats and Dog")
['c', 'a', 't', 's', 'a', 'n', 'd', 'D', 'o', 'g']
1
re.findall("[^a-zA-Z]","cats and Dog") # not in the list
[' ', ' ']
1
re.findall("[a\-z]","i am a z-ip")
['a', 'a', 'z', '-']
1
re.findall("[(+*)^]","1+1+2^1=3*(1)")
['+', '+', '^', '*', '(', ')']
1
re.findall("[^(+*)^]","1+1+2^1=3*(1)")
['1', '1', '2', '1', '=', '3', '1']
  • : space, tab, new line, = [ ]
  • r"" = r"[a-zA-Z_]"
  • digits = r"[0-9]"
  • "^": not in the [], start of a line
  • new line 开头有个invisible ^, end line have invisible $
  • "$": end of a line
  • ".": including everything
  • 只想匹配句号的话用:"."
  • "o..r" match "o任意两个字母r"
  • word boundary: a 以a结尾的单词
  • 如果想neglect上述,只需要大写,例如:, neglect all the words
  • +: "e+" match any number of e's
  • : "e"
1
re.findall("oe+","oeuvree")
['oe']
1
re.findall("[oe]+","oeuvree")
['oe', 'ee']
1
re.findall("[whe]*","while wheeee weheee")
['wh', '', '', 'e', '', 'wheeee', '', 'weheee', '']

‘*’ : zero or more of the previous item

‘+’ : one or more of the previous item

‘?’ : zero or one of the previous item

‘{4}’ : exactly four of the previous item

‘{3,}’ : three or more of previous item

‘{2,5}’ : between two and five (inclusive) of previous item

1
re.findall("whe*","while wheeee weheee heee weee")
['wh', 'wheeee']

‘|’ (“pipe”) is a special character that allows one to specify “or” clauses

Example: I want to match the word “cat” or the word “dog”

Solution: ‘(cat|dog)’ 记得加括号!!!

Note: parentheses are not strictly necessary here, but parentheses tend to make for easier reading and avoid possible ambiguity. It’s a good habit to just use them always.

1
re.findall(r"a|aa|aaa","aaaa")
['a', 'a', 'a', 'a']
1
re.findall(r"a+","aaaa")
['aaaa']

'?' modifies operators like '+' and '*' not be greedy,get lazy matching

1
re.findall(r"a+?","aaaa")
['a', 'a', 'a', 'a']
1
re.findall("\w+@\w+\.\w+","your email is kobe@icloud.com, his email is jacob@umich.edu")
['kobe@icloud.com', 'jacob@umich.edu']

capture group ():

  • (+@+.+)
1
2
3
4
regex = r'(\w+@\w+\.\w+)'
s = "your email is kobe@icloud.com kobe@icloud.com his email is jacob@umich.edu"
m=re.findall(regex,s)
m
['kobe@icloud.com', 'kobe@icloud.com', 'jacob@umich.edu']
  • backreferences
    • in paranthesis() occur once again
      • ([a-zA-Z]+).*\1 首尾以相同字段开始和结束,中间任意
1
2
3
4
regex = r'(\w+)\1'
s = "freefree"
m = re.search(regex,s)
m.group(1)
'free'
1
2
3
4
regex = r'\w+'
s = "nonrepeat"
m = re.search(regex,s)
m.group(0)
'nonrepeat'
1

SQL

  • 每列一个type
  • key: uniquely identify a record
  • sqliteonline.com
  • features:
    • atomicity
    • consistency
    • isolation
    • durability
  • https://www.w3schools.com/sql/
  • select
    • SELECT * FROM countries
    • SELECT id, name FROM countries
    • SELECT * FROM countries WHERE [filter]
    • <> !=
    • = ==
    • SELECT * FROM countries WHERE [] BETWEEN 86 AND 90
    • SELECT * FROM countries WHERE [filter] IS NULL
      • Note: 空格不会match NULL
  • SORT
    • ORDER BY
      • SELECT [columns] FROM [table] ORDER BY [column] [ASC|DESC]
      • SELECT [columns] FROM countries ORDER BY phonecode ASC, name DESC
      • 等价于 SELECT [columns] FROM countries ORDER BY phonecode, name DESC
      • 默认ASC
  • DISTINCT
    • SELECT DISTINCT [columns] FROM [table]
  • WHERE
    • 支持set
      • SELECT id from t_student WHERE major IN ("Mathematics","statistics")
      • SELECT id from t_student WHERE major NOT IN ("Mathematics","statistics")
    • 支持regex-like matching
      • SELCET id from t_student WHERE first_name LIKE "M%"
      • %: match any number of letters
      • _: match one number of letters
  • GROUP BY
    • SELECT customer_id, SUM(dollar_amount) FROM t_transactions GROUP BY customer_id
    • COUNT, AVG, MIN, MAX: aggregate function
    • SECLECT phonecode, COUNT() FROM countries GROUP BY phonecode
    • HAVING: filer results after GROUP BY using the HAVING keyword
      • SELECT customer_id, SUM(dollar_amount) AS total_dolar FROM t_transactions GROUP BY customer_id HAVING total_dolar >50
  • JOIN: merging tables
    • INNER JOIN
      • SELECT id, name, color FROM t_student INNER JOIN t_personal ON t_students.id=t_personal.id
    • LEFT JOIN
    • RIGHT JOIN
    • FULL JOIN
  • INSERT INTO
    • INSERT INTO table_name [col1, col2, col3, …] VALUES value1, value2, value3, …
  • UPDATE
    • UPDATE table_name SET col1=value1,col2=value2, WHERE condition
    • Caution: if WHERE is left empty, you'll delete/modify the whole tanle!
  • DELETE
    • DELETE FROM table_name WHERE condition
  • CREATE TABLE
    • CREATE TABLE table_name [col1 datatype, col2 datatype, …]
  • DROP TABLE
    • DROP TABLE table_name;
  • SELECT * FROM sqlite_master
  • SELECT * FROM sqlite_master WHERE type="table"

课件里的python例子

1
2
3
import sqlite3
con = sqlite3.connect('ps8/flights.sqlite')
cur = con.cursor()
1
con
<sqlite3.Connection at 0x2615592b040>
1
2
res = cur.execute("select * from flights limit 10") # head 10 of table
res
<sqlite3.Cursor at 0x2615594d3c0>

(Actually, res is just the original cursor itself):

1
res is cur
True
1
2
for row in res:
print(row[0])
2013
2013
2013
2013
2013
2013
2013
2013
2013
2013

Fetching results

To retrieve results, use the cur.fetchone() or cur.fetchall() methods:

1
2
3
res = cur.execute("select * from flights limit 10") # head 10 of table

cur.fetchone()
(2013,
 1,
 1,
 517,
 515,
 2.0,
 830,
 819,
 11.0,
 'UA',
 1545,
 'N14228',
 'EWR',
 'IAH',
 227.0,
 1400.0,
 5.0,
 15.0,
 1357034400.0)

This returns a tuple containing one entry for each column in our SELECT statement.

If you want all the rows, use cur.fetchall():

It is an iterator

1
len(res.fetchall())
9

Notice that .fetchall() works like an iterator--we SELECTed 10 rows, fetched one of them using .fetchone(), and then fetched the remaining 9 using .fetchall(). If we run this again we'll get zero rows back because we already fetched everything:

1
len(res.fetchall())
0

Parameterized queries

Sometimes our queries must depend on user input. Consider the following query which returns all the flights that happened in a certain month:

1
2
3
4
5
def num_flights_in_month(month):
query = """SELECT COUNT() FROM flights WHERE month = %d""" % month
return cur.execute(query).fetchone()[0]

num_flights_in_month(1)
27004

For queries that depend on a parameter, we can use the special placeholder ? to have the database do parameter substitution for us:

1
2
3
4
5
def num_flights_in_month(m):
query = """SELECT COUNT() FROM flights WHERE month = ?"""
return cur.execute(query, [m]).fetchone()[0]

num_flights_in_month(1)
27004

Using multiple placeholders is possible:

1
2
3
4
5
def num_flights_on_day(m, d):
query = """SELECT COUNT() FROM flights WHERE month = ? AND day = ?"""
return cur.execute(query, [m, d]).fetchone()[0]

num_flights_on_day(12, 25)
719

Notice that, even if there is just one parameter, you have to pass in all the parameters as a list.

Why use placeholders

Consider the following table:

1
2
3
4
5
6
7
8
scon = sqlite3.connect(":memory:") 
scon.execute('CREATE TABLE users (id INTEGER PRIMARY KEY AUTOINCREMENT, username TEXT, age INT, password TEXT)')
scon.execute("""INSERT INTO users(username, age, password) VALUES
('Jonathan', 40, 'beeri$g00d'),
('Sarah', 41, 'wat3rp0l0'),
('Julian', 2, 'il0vecr0ut0n3')
""")
scon.commit()

Here's our table:

1
2
import pandas as pd
pd.read_sql_query("SELECT * from users", scon)
id username age password
0 1 Jonathan 40 beeri$g00d
1 2 Sarah 41 wat3rp0l0
2 3 Julian 2 il0vecr0ut0n3

Now let's say we have a function which takes a user-supplied parameter, their username, and returns their user information:

1
2
3
4
5
6
def user_info(username):
q = "SELECT * FROM users WHERE username = '%s'" % username
print(q)
return pd.read_sql_query(q, scon)

user_info('Jonathan')
SELECT * FROM users WHERE username = 'Jonathan'
id username age password
0 1 Jonathan 40 beeri$g00d

This works great assuming that what got passed in is actually a username, but a malicious user could do the following:

1
user_info("""abc' or 'a'='a""")
SELECT * FROM users WHERE username = 'abc' or 'a'='a'
id username age password
0 1 Jonathan 40 beeri$g00d
1 2 Sarah 41 wat3rp0l0
2 3 Julian 2 il0vecr0ut0n3

This is known as an injection attack. For this reason, we do not passed user input directly into a query string. Instead we use placeholders, and rely on the SQL engine itself to perform the substitution:

1
2
3
4
5
6
def safe_user_info(username):
q = "SELECT * FROM users WHERE username = ?"
print(q)
return pd.read_sql_query(q, scon, params=[username])

safe_user_info('Jonathan')
SELECT * FROM users WHERE username = ?
id username age password
0 1 Jonathan 40 beeri$g00d
1
safe_user_info("""abc' or 'a'='a""")
SELECT * FROM users WHERE username = ?
id username age password

Ten questions about flights

Now we will use SQL queries to various questions about the flights dataset.

Which airport is the busiest?

We need to aggregate flights by the origin column, and they count the number of rows in each group.

1
cur.execute("SELECT origin, COUNT() FROM flights GROUP BY origin").fetchall()
[('EWR', 120835), ('JFK', 111279), ('LGA', 104662)]
1
cur.execute("SELECT origin, COUNT() FROM flights GROUP BY origin").description
(('origin', None, None, None, None, None, None),
 ('COUNT()', None, None, None, None, None, None))
1
pd.read_sql_query("SELECT origin, COUNT() FROM flights GROUP BY origin",con)
origin COUNT()
0 EWR 120835
1 JFK 111279
2 LGA 104662

Which month was coldest?

We need to aggregate weather by month, and the compute some measure of coldness.

1
cur.execute("SELECT month, AVG(temp) FROM weather GROUP BY month").fetchall()
[(1, 35.63566037735852),
 (2, 34.27059701492533),
 (3, 39.880071845532164),
 (4, 51.745641500694774),
 (5, 61.79500000000004),
 (6, 72.18399999999997),
 (7, 80.06622082585272),
 (8, 74.4684657039714),
 (9, 67.3712922649374),
 (10, 60.07113019891505),
 (11, 44.9904343764596),
 (12, 38.44180037313427)]

This one is a bit trickier. We start by counting the number of flights for each combination of carrier and origin:

1
cur.execute("SELECT COUNT(), carrier, origin FROM flights GROUP BY carrier, origin").fetchall()
[(1268, '9E', 'EWR'),
 (14651, '9E', 'JFK'),
 (2541, '9E', 'LGA'),
 (3487, 'AA', 'EWR'),
 (13783, 'AA', 'JFK'),
 (15459, 'AA', 'LGA'),
 (714, 'AS', 'EWR'),
 (6557, 'B6', 'EWR'),
 (42076, 'B6', 'JFK'),
 (6002, 'B6', 'LGA'),
 (4342, 'DL', 'EWR'),
 (20701, 'DL', 'JFK'),
 (23067, 'DL', 'LGA'),
 (43939, 'EV', 'EWR'),
 (1408, 'EV', 'JFK'),
 (8826, 'EV', 'LGA'),
 (685, 'F9', 'LGA'),
 (3260, 'FL', 'LGA'),
 (342, 'HA', 'JFK'),
 (2276, 'MQ', 'EWR'),
 (7193, 'MQ', 'JFK'),
 (16928, 'MQ', 'LGA'),
 (6, 'OO', 'EWR'),
 (26, 'OO', 'LGA'),
 (46087, 'UA', 'EWR'),
 (4534, 'UA', 'JFK'),
 (8044, 'UA', 'LGA'),
 (4405, 'US', 'EWR'),
 (2995, 'US', 'JFK'),
 (13136, 'US', 'LGA'),
 (1566, 'VX', 'EWR'),
 (3596, 'VX', 'JFK'),
 (6188, 'WN', 'EWR'),
 (6087, 'WN', 'LGA'),
 (601, 'YV', 'LGA')]

Next, we need to rank the carriers within origin by their counts.

1
2
3
4
5
6
7
8
9
cur.execute(
"""
SELECT RANK() OVER (
PARTITION BY origin
ORDER BY n
) AS r, carrier, origin, n FROM
(SELECT COUNT() as n, carrier, origin FROM flights GROUP BY carrier, origin)
"""
).fetchall()
[(1, 'OO', 'EWR', 6),
 (2, 'AS', 'EWR', 714),
 (3, '9E', 'EWR', 1268),
 (4, 'VX', 'EWR', 1566),
 (5, 'MQ', 'EWR', 2276),
 (6, 'AA', 'EWR', 3487),
 (7, 'DL', 'EWR', 4342),
 (8, 'US', 'EWR', 4405),
 (9, 'WN', 'EWR', 6188),
 (10, 'B6', 'EWR', 6557),
 (11, 'EV', 'EWR', 43939),
 (12, 'UA', 'EWR', 46087),
 (1, 'HA', 'JFK', 342),
 (2, 'EV', 'JFK', 1408),
 (3, 'US', 'JFK', 2995),
 (4, 'VX', 'JFK', 3596),
 (5, 'UA', 'JFK', 4534),
 (6, 'MQ', 'JFK', 7193),
 (7, 'AA', 'JFK', 13783),
 (8, '9E', 'JFK', 14651),
 (9, 'DL', 'JFK', 20701),
 (10, 'B6', 'JFK', 42076),
 (1, 'OO', 'LGA', 26),
 (2, 'YV', 'LGA', 601),
 (3, 'F9', 'LGA', 685),
 (4, '9E', 'LGA', 2541),
 (5, 'FL', 'LGA', 3260),
 (6, 'B6', 'LGA', 6002),
 (7, 'WN', 'LGA', 6087),
 (8, 'UA', 'LGA', 8044),
 (9, 'EV', 'LGA', 8826),
 (10, 'US', 'LGA', 13136),
 (11, 'AA', 'LGA', 15459),
 (12, 'MQ', 'LGA', 16928),
 (13, 'DL', 'LGA', 23067)]



<IPython.core.display.Javascript object>

Finally, to find the most popular carriers, we can filter this table to only the rows where rank=1. To do this I will use a subquery:

SELECT * FROM (<another query>)

1
2
3
4
5
6
7
8
cur.execute("""
SELECT * FROM (SELECT RANK() OVER (
PARTITION BY origin
ORDER BY n DESC
) AS r, carrier, origin, n FROM
(SELECT COUNT() as n, carrier, origin FROM flights GROUP BY carrier, origin)
) WHERE r = 1
""").fetchall()
[(1, 'UA', 'EWR', 46087), (1, 'B6', 'JFK', 42076), (1, 'DL', 'LGA', 23067)]
1
2
3
4
5
6
7
8
9
cur.execute("""
SELECT * FROM (SELECT RANK() OVER (
PARTITION BY origin
ORDER BY n DESC
) AS r, carrier, origin, n FROM
(SELECT COUNT() as n, carrier, origin FROM flights GROUP BY carrier, origin)
) AS subq JOIN airlines ON airlines.carrier = subq.carrier
WHERE r = 1
""").fetchall()
[(1, 'B6', 'JFK', 42076, 'B6', 'JetBlue Airways'),
 (1, 'DL', 'LGA', 23067, 'DL', 'Delta Air Lines Inc.'),
 (1, 'UA', 'EWR', 46087, 'UA', 'United Air Lines Inc.')]

What are the busiest travel weeks of the year?

We need to figure out the week of each flight, and then aggregate them. To manipulate dates in sqlite, we have the strftime() function:

1
cur.execute("""SELECT strftime('%W', 'now')""").fetchone()
('11',)
1
cur.execute("""SELECT strftime('The day of the week is %w, the week of the year is %W', 'now')""").fetchone()
('The day of the week is 0, the week of the year is 11',)

The Unix epoch

The time_hour column in flights looks like:

1
cur.execute("""SELECT time_hour FROM flights""").fetchone()
(1357034400.0,)

This number represents the number of seconds that have elapsed since midnight GMT on January 1, 1970, a date known as the Unix epoch. We can use strftime() on these dates provided we tell it how they are formatted.

1
2
# datetime() is an alias for strftime('%Y-%m-%d %H:%M:%S', ...)
cur.execute("""SELECT datetime(1357034400, 'unixepoch')""").fetchone()
('2013-01-01 10:00:00',)

Now we can convert each date to a week of the year:

1
pd.read_sql_query("""SELECT year, month, day, strftime('%W', time_hour, 'unixepoch') from flights""", con)
year month day strftime('%W', time_hour, 'unixepoch')
0 2013 1 1 00
1 2013 1 1 00
2 2013 1 1 00
3 2013 1 1 00
4 2013 1 1 00
... ... ... ... ...
336771 2013 9 30 39
336772 2013 9 30 39
336773 2013 9 30 39
336774 2013 9 30 39
336775 2013 9 30 39

336776 rows × 4 columns

The final query looks like:

1
2
3
4
5
con.execute("""
SELECT COUNT() as n, strftime('%W', time_hour, 'unixepoch') as week FROM flights
GROUP BY week
ORDER BY n DESC
""").fetchall()[:10]
[(6760, '27'),
 (6740, '29'),
 (6737, '28'),
 (6735, '30'),
 (6733, '31'),
 (6702, '25'),
 (6687, '32'),
 (6675, '24'),
 (6639, '23'),
 (6631, '33')]

What where the ten worst days for average departure delay?

1
2
3
4
con.execute('''SELECT year, month, day, AVG(dep_delay) AS avg_delay
FROM flights GROUP BY year, month, day
ORDER BY avg_delay DESC
LIMIT 10''').fetchall()
[(2013, 3, 8, 83.5369211514393),
 (2013, 7, 1, 56.23382519863791),
 (2013, 9, 2, 53.029550827423165),
 (2013, 7, 10, 52.860702151755376),
 (2013, 12, 5, 52.32799013563502),
 (2013, 5, 23, 51.14471968709257),
 (2013, 9, 12, 49.95875),
 (2013, 6, 28, 48.827784156142364),
 (2013, 6, 24, 47.15741789354473),
 (2013, 7, 22, 46.66704675028506)]

What where the ten best days for departure delay?

1
2
3
4
con.execute('''SELECT year, month, day, AVG(dep_delay) AS avg_delay
FROM flights GROUP BY year, month, day
ORDER BY avg_delay ASC
LIMIT 10''').fetchall()
[(2013, 9, 24, -1.3298319327731092),
 (2013, 10, 2, -0.919917864476386),
 (2013, 11, 9, -0.6288515406162465),
 (2013, 9, 6, -0.39792746113989635),
 (2013, 9, 5, -0.3877973112719752),
 (2013, 10, 29, -0.34932221063607927),
 (2013, 9, 7, -0.25036390101892286),
 (2013, 11, 5, -0.18314255983350677),
 (2013, 10, 19, -0.10703812316715543),
 (2013, 10, 1, -0.09895833333333333)]

What airlines have the most departure delay?

1
2
3
4
5
6
con.execute('''SELECT airlines.carrier, name, AVG(dep_delay) AS avg_delay
FROM flights
JOIN airlines on airlines.carrier=flights.carrier
GROUP BY airlines.name
ORDER BY avg_delay DESC
LIMIT 10''').fetchall()
[('F9', 'Frontier Airlines Inc.', 20.215542521994134),
 ('EV', 'ExpressJet Airlines Inc.', 19.955389827868213),
 ('YV', 'Mesa Airlines Inc.', 18.996330275229358),
 ('FL', 'AirTran Airways Corporation', 18.72607467838092),
 ('WN', 'Southwest Airlines Co.', 17.71174377224199),
 ('9E', 'Endeavor Air Inc.', 16.725769407441433),
 ('B6', 'JetBlue Airways', 13.022522106740018),
 ('VX', 'Virgin America', 12.869421165464821),
 ('OO', 'SkyWest Airlines Inc.', 12.586206896551724),
 ('UA', 'United Air Lines Inc.', 12.106072888459614)]

What airplane flew the most days?

Airplanes are identified by their tail number.

1
2
3
4
5
6
cur.execute('''SELECT tailnum, COUNT() AS n FROM
(SELECT DISTINCT year, month, day, tailnum FROM flights)
GROUP BY tailnum
ORDER BY n DESC
LIMIT 1
''').fetchall()
[('N328AA', 336)]

Interestingly, you can Google most tail numbers: N328AA.

How many flights were cancelled?

We'll say that a cancelled flight has dep_time missing.

1
cur.execute('''SELECT COUNT() FROM flights WHERE dep_time IS NULL''').fetchone()
(0,)

Notice that we don't write = NULL. That does not work:

1
cur.execute('''SELECT COUNT() FROM flights WHERE dep_time = NULL''').fetchone()
(0,)

How many flights were destined for airports with a one word name?

(Not including the word "airport")

Here is an example of an airport with a one-word name:

1
cur.execute("SELECT * FROM airports WHERE name = 'Packwood'").description
(('faa', None, None, None, None, None, None),
 ('name', None, None, None, None, None, None),
 ('lat', None, None, None, None, None, None),
 ('lon', None, None, None, None, None, None),
 ('alt', None, None, None, None, None, None),
 ('tz', None, None, None, None, None, None),
 ('dst', None, None, None, None, None, None),
 ('tzone', None, None, None, None, None, None))



<IPython.core.display.Javascript object>

CAST(xxx AS INTEGER)

1
2
3
4
5
6
cur.execute(
"""
SELECT name
FROM airports WHERE name NOT LIKE '% %'
"""
).fetchall()
[('Packwood',),
 ('Florence',),
 ('Arcata',),
 ('Kodiak',),
 ('Addison',),
 ('Alexandria',),
 ('Centennial',),
 ('Lakeland',),
 ('Appleton',),
 ('Executive',),
 ('Avalon',),
 ('Kalamazoo',),
 ('Bethel',),
 ('Beaufort',),
 ('Bettles',),
 ('KBWD',),
 ('CedarKey',),
 ('Clemson',),
 ('Charlottesville-Albemarle',),
 ('Chehalis-Centralia',),
 ('Champaign',),
 ('Yeager',),
 ('Tri-Cities',),
 ('Decatur',),
 ('Dillingham',),
 ('Dupage',),
 ('Destin',),
 ('Unalaska',),
 ('Orlando',),
 ('Bessemer',),
 ('Wendover',),
 ('Franklin',),
 ('Tipton',),
 ('Gulkana',),
 ('Gulfport-Biloxi',),
 ('Majors',),
 ('Dillingham',),
 ('Hana',),
 ('Homer',),
 ('Iliamna',),
 ('Immokalee',),
 ('Kapalua',),
 ('Logan-Cache',),
 ('Lihue',),
 ('Lincoln',),
 ('Lanai',),
 ('Castle',),
 ('Mojave',),
 ('Downtown',),
 ('Molokai',),
 ('Manassas',),
 ('MariposaYosemite',),
 ('Lakefront',),
 ('County',),
 ('Kahului',),
 ('Nome',),
 ('Executive',),
 ('Northway',),
 ('Platinum',),
 ('Renton',),
 ('Rochester',),
 ('Prosser',),
 ('Deadhorse',),
 ('Sedona',),
 ('Gillespie',),
 ('Teterboro',),
 ('Telluride',),
 ('Talkeetna',),
 ('Toledo',),
 ('Upolu',),
 ('Apopka',),
 ('Yakutat',),
 ('Clear',)]



<IPython.core.display.Javascript object>
1
2
3
4
5
6
7
cur.execute(
"""
SELECT COUNT() FROM flights
INNER JOIN (SELECT faa FROM airports WHERE name NOT LIKE '% %') one_word
ON one_word.faa = flights.dest
"""
).fetchall()
[(190,)]



<IPython.core.display.Javascript object>