Numpy
Built on Numpy:
SciPy
Pandas
matplotlib
TensorFlow
scikit-learn
PyTorch
JAX
Numpy quick start guide:
https://numpy.org/doc/stable/user/quickstart.html
The closely related package scipy provides, among other things, optimization routines.
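As a minimal illustration (assuming SciPy is installed), minimizing a simple one-dimensional function:

import scipy.optimize as opt
# minimize (x - 2)^2; res.x should be close to 2.0
res = opt.minimize_scalar(lambda x: (x - 2)**2)
res.x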
Basic numerical data types:
bool
int
uint
float
complex
https://docs.scipy.org/doc/numpy-1.10.1/user/basics.types.html
numpy.array
https://docs.scipy.org/doc/numpy-1.10.1/user/basics.creation.html
compared to list:
import numpy as np
np.array([1, 2, 3], dtype='uint')
array([1, 2, 3], dtype=uint32)
array([[0., 0., 0.],
[0., 0., 0.]])
np.arange(2, 3, 0.1, dtype='float')
array([2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9])
Beware: large integers can overflow, and the result may differ across machines (the default integer dtype is platform-dependent).
import numpy as np
a = np.arange(100_000_000, dtype='int64')
np.sum(a)
4999999950000000
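A minimal sketch of the overflow issue: fixed-width integers wrap around (NumPy may also emit a RuntimeWarning, depending on version):

import numpy as np
x = np.int32(2**31 - 1)   # largest int32
x + np.int32(1)           # wraps to -2147483648, not 2**31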
np.empty sometimes returns arbitrary (uninitialized) values, which is an easy source of bugs
array([[0., 0., 0.],
[0., 0., 0.]])
arange()
np.arange(27).reshape(3, 3, 3)
array([[[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8]],
[[ 9, 10, 11],
[12, 13, 14],
[15, 16, 17]],
[[18, 19, 20],
[21, 22, 23],
[24, 25, 26]]])
a = np.arange(27).reshape(3, 3, 3)
a[0]
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
array([5, 6, 7, 8, 9])
array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
array indexing
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
array([0, 1, 2, 3, 4, 5, 6, 7])
array([1, 3, 5])
array([0, 2, 4, 6, 8])
x = np.reshape(np.arange(1, 13), (3, 4))
x
array([[ 1, 2, 3, 4],
[ 5, 6, 7, 8],
[ 9, 10, 11, 12]])
array([5, 6, 7, 8])
array([[ 2, 4],
[ 6, 8],
[10, 12]])
array([ 2, 12])
array([ 8, 9, 10, 11, 12])
array([], shape=(0, 2, 5), dtype=int32)
x = np.arange(9).reshape(3, 3)
print(x)
x[np.ix_([0, 2], [0, 1])]
[[0 1 2]
[3 4 5]
[6 7 8]]
array([[0, 1],
[6, 7]])
Example: MNIST
import numpy as np
minist = np.load('8/mnist.npz')
array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8)
['images', 'labels']
2
(60000, 784)
img = minist['images']
img[0].shape
(784,)
img_0 = img[0].reshape(28, 28)
%matplotlib inline
import matplotlib.pyplot as plt
plt.imshow(img_0, cmap="Greys")
<matplotlib.image.AxesImage at 0x2d675185120>
The MNIST data are stored as uint8, so summing them directly is wrong (the values overflow); convert to a wider type first.
np.uint8(255).astype(int).dtype
dtype('int32')
numpy/scipy universal functions (ufuncs) support vectorized operations
np.min, np.max are faster than the built-in min and max
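A rough sketch of the speed difference (exact timings vary by machine):

import numpy as np
from timeit import timeit
a = np.random.rand(1_000_000)
timeit(lambda: min(a), number=10)     # built-in min iterates element by element
timeit(lambda: np.min(a), number=10)  # vectorized reduction, much faster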
sum(img.reshape(len(img), 28, 28)).shape
(28, 28)
33.318421449829934
(47040000,)
Summing along a given axis:
plt.imshow(img[minist['labels']==8].mean(axis=0).reshape(28, 28))
<matplotlib.image.AxesImage at 0x2d675301ab0>
Numpy example
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

salaries_full = pd.read_csv(
    "9/salaries_clean.csv",
    encoding="latin1",
    index_col="salary_id")
columns = [
    "location_latitude",
    "location_longitude",
    "total_experience_years",
    "employer_experience_years",
    "annual_base_pay",
    "signing_bonus",
]
salary_data = salaries_full[columns].to_numpy()
salary_data
array([[ 3.7770e+01, -1.2241e+02, 1.3000e+01, 2.0000e+00, 1.2500e+05,
5.0000e+03],
[ 3.6360e+01, -9.4200e+01, 1.5000e+01, 8.0000e+00, 6.5000e+04,
nan],
[ 4.1470e+01, -8.1670e+01, 4.0000e+00, 1.0000e+00, 8.6000e+04,
5.0000e+03],
...,
[ nan, nan, 7.0000e+00, 1.0000e+00, 1.5000e+05,
0.0000e+00],
[ 3.8990e+01, -7.6930e+01, 5.0000e+00, 1.0000e+00, 7.5000e+04,
nan],
[ nan, nan, 2.0000e+00, 0.0000e+00, 5.0000e+01,
0.0000e+00]])
Numpy indexing conventions:
(1655,)
col = np.array(columns)
col == 'annual_base_pay'
annual_base_pay = salary_data[:, col == 'annual_base_pay']
np.mean(annual_base_pay)
nan
np.nanmean(annual_base_pay)
6325003.746965475
np.isnan(annual_base_pay)
array([[False],
[False],
[False],
...,
[False],
[False],
[False]])
annual_base_pay.reshape(-1)
array([1.25e+05, 6.50e+04, 8.60e+04, ..., 1.50e+05, 7.50e+04, 5.00e+01])
annual_base_pay[np.isnan(annual_base_pay)]
array([nan, nan, nan, nan])
~np.isnan(annual_base_pay)
array([[ True],
[ True],
[ True],
...,
[ True],
[ True],
[ True]])
np.all(~np.isnan(salary_data), axis=1)
np.any(np.isnan(salary_data), axis=1)
array([False, True, False, ..., True, True, True])
salary_data[~np.any(np.isnan(salary_data), axis=1)]
array([[ 3.7770e+01, -1.2241e+02, 1.3000e+01, 2.0000e+00, 1.2500e+05,
5.0000e+03],
[ 4.1470e+01, -8.1670e+01, 4.0000e+00, 1.0000e+00, 8.6000e+04,
5.0000e+03],
[ 3.8000e+01, -9.7000e+01, 5.0000e+00, 1.5000e+00, 4.0000e+04,
0.0000e+00],
...,
[ 4.0030e+01, -7.5630e+01, 2.0000e+00, 1.0000e+00, 6.0000e+04,
0.0000e+00],
[ 3.9950e+01, -7.5160e+01, 6.0000e+00, 2.5000e+00, 1.1100e+05,
5.0000e+03],
[ 4.0020e+01, -1.0525e+02, 1.0000e+00, 5.0000e-01, 6.0500e+04,
0.0000e+00]])
array([[ 3.899e+01, -7.693e+01, 5.000e+00, 1.000e+00, 7.500e+04,
nan],
[ nan, nan, 2.000e+00, 0.000e+00, 5.000e+01,
0.000e+00]])
array([[ 3.636e+01, -9.420e+01, 1.500e+01, 8.000e+00, 6.500e+04,
nan],
[ 4.147e+01, -8.167e+01, 4.000e+00, 1.000e+00, 8.600e+04,
5.000e+03],
[ nan, nan, 4.000e+00, 0.000e+00, 1.050e+05,
5.000e+03]])
15.0
Normalizing Data
col_mean = np.nanmean(salary_data, axis=0)
array([ 3.77543939e+01, -6.47122727e+01, 6.75611318e+00, 2.65616294e+00,
6.32500375e+06, 2.24056269e+04])
Note: if the matrix is very large, broadcasting can create huge intermediate arrays and exhaust memory; in that case you may have to fall back to a for loop.
For broadcasting, each dimension of the two arrays must either match, equal 1, or be absent:
$5\times4$ broadcasts with a scalar;
$5\times4$ broadcasts with a length-4 array.
But to subtract a different number from each row of a $5\times4$ matrix, a plain length-5 array does not work, because broadcasting aligns shapes from the innermost (trailing) axis. Reshape it to (5, 1), e.g. with reshape(5, 1) or [:, None], i.e. append a trailing dimension of size 1. A sketch follows.
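A small sketch of this rule:

import numpy as np
A = np.arange(20.).reshape(5, 4)
r = A.mean(axis=1)                # shape (5,)
# A - r                          # ValueError: trailing axes are 4 vs 5
(A - r[:, None]).shape            # (5,1) broadcasts across the columns -> (5, 4)
(A - A.mean(axis=1, keepdims=True)).shape  # equivalent, via keepdims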
salary_data.mean(1)
salary_data.mean(axis=1)
array([21655.06, nan, 15160.8 , ..., nan, nan, nan])
row_mu = salary_data.mean(axis=1)
row_mu.shape, row_mu[:, None].shape, row_mu[:, None, None].shape
((1655,), (1655, 1), (1655, 1, 1))
salary_data - row_mu[:, None]
array([[-21617.29, -21777.47, -21642.06, -21653.06, 103344.94, -16655.06],
[ nan, nan, nan, nan, nan, nan],
[-15119.33, -15242.47, -15156.8 , -15159.8 , 70839.2 , -10160.8 ],
...,
[ nan, nan, nan, nan, nan, nan],
[ nan, nan, nan, nan, nan, nan],
[ nan, nan, nan, nan, nan, nan]])
np.array(1).shape, np.array(1)[None].shape
((), (1,))
row_mu = salary_data.mean(axis=1, keepdims=True)
row_mu.shape
(1655, 1)
salary_data.mean(axis=1)
array([21655.06, nan, 15160.8 , ..., nan, nan, nan])
for loops are too slow
prices = np.array([20, 18, 21, 14])
prices - np.minimum.accumulate(prices)
array([0, 0, 3, 0])
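For comparison, a sketch of the same running-minimum computation with an explicit loop (orders of magnitude slower on large arrays):

import numpy as np
prices = np.array([20, 18, 21, 14])
out = []
running_min = float('inf')
for p in prices:
    running_min = min(running_min, p)
    out.append(p - running_min)
out   # [0, 0, 3, 0]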
n = 10
p = 2
k = 3
x = np.random.rand(n, p)
c = np.random.rand(k, p)
x.shape
(10, 2)
array([[0.57019677, 0.43860151],
[0.98837384, 0.10204481],
[0.20887676, 0.16130952],
[0.65310833, 0.2532916 ],
[0.46631077, 0.24442559],
[0.15896958, 0.11037514],
[0.65632959, 0.13818295],
[0.19658236, 0.36872517],
[0.82099323, 0.09710128],
[0.83794491, 0.09609841]])
array([[0.58651293, 0.02010755],
[0.82894003, 0.00469548],
[0.67781654, 0.27000797],
[0.73519402, 0.96218855],
[0.24875314, 0.57615733],
[0.59204193, 0.57225191],
[0.22308163, 0.95274901],
[0.44712538, 0.84640867],
[0.69947928, 0.29743695],
[0.81379782, 0.39650574]])
(n,1,p) minus (1,k,p) broadcasts to an (n,k,p) array.
D = x[:, None, :] - c[None, :]
D[0]
array([[-0.40626269, -0.03004969],
[-0.40656432, -0.16624401],
[-0.16906681, 0.39941372]])
(10, 3, 2)
x_c_norm = np.linalg.norm(D, axis=2)
np.argmin(x_c_norm, axis=1)
array([0, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)
import numpy as np
k = 3
a = np.array([0, 1, 2, 3, 4, 5])
ans = a[:, None] + a[None, :]
ans[:len(a)-k+1, :k]
array([[0, 1, 2],
[1, 2, 3],
[2, 3, 4],
[3, 4, 5]])
import numpy as np
x = np.arange(10)
x[np.logical_and(x > 3, x < 7)]
array([4, 5, 6])
x[np.logical_or(x > 3, x < 7)]
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
np.random.random((2, 3))
array([[0.8754406 , 0.31234192, 0.21472602],
[0.1417056 , 0.6179606 , 0.0532995 ]])
np.random.normal(0, 1, 20)
array([ 1.36952814, 0.39363222, 0.48644709, -0.23743297, 0.21279548,
-0.5515374 , -2.22034906, -1.41651274, 0.08532262, 0.49804278,
-2.03253663, 0.66368554, 0.28155033, 0.94521731, 0.5938768 ,
-1.57368613, -1.55284648, 0.54146951, -0.75257585, -0.23976487])
np.random.uniform(0, 1, (2, 4))
array([[0.5909873 , 0.8235658 , 0.14703172, 0.94326283],
[0.79080321, 0.34115745, 0.75083531, 0.31725324]])
x = np.arange(1, 11)
for i in range(5):
    print(np.random.choice(x, 5, False, x/float(sum(x))))
[ 6 10 7 9 5]
[ 9 8 10 7 6]
[ 2 8 9 10 5]
[10 9 8 3 6]
[7 8 6 5 9]
x = np.arange(10)
np.random.shuffle(x)
x
array([1, 7, 4, 0, 8, 3, 5, 2, 6, 9])
np.random.permutation(x)
array([1, 9, 5, 7, 3, 4, 2, 0, 6, 8])
array([1, 7, 4, 0, 8, 3, 5, 2, 6, 9])
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
Pandas
basic structures
Series
One dimensional labeled array
DataFrame
table of rows, with labeled columns
import numpy as np
a = np.array([0, 1, 2, 3, 4, 5])
(a[:, None] + a[None, :])[:4, :3]
array([[0, 1, 2],
[1, 2, 3],
[2, 3, 4],
[3, 4, 5]])
import pandas as pd
flight = pd.read_csv("10/flights.csv.gz")
Series
can create a pandas Series from any array-like structure (dict, numpy array, list)
Unlike a NumPy array, a Series has labels (an index).
import numpy as np
numbers = np.random.randn(5)
s = pd.Series(numbers)
s
0 1.556860
1 0.187127
2 -0.245997
3 1.342677
4 -0.117419
dtype: float64
idx = ['a', 'b', 'c', 'd', 'e']
s = pd.Series(numbers, index=idx)
s
a 1.556860
b 0.187127
c -0.245997
d 1.342677
e -0.117419
dtype: float64
providing too few or too many indices is a ValueError
pd.Series(np.random.rand(5), index=['a', 'b', 'c', 'd', 'e'])
a 0.996875
b 0.893754
c 0.925473
d 0.853753
e 0.500225
dtype: float64
can create a Series from a dict
d = {"dog": 1, "cat": 2}
pd.Series(d)
dog 1
cat 2
dtype: int64
pd.Series(d, index=["dog", "cat", "pig"])
dog 1.0
cat 2.0
pig NaN
dtype: float64
The dict has no 'pig' key, so pig is assigned NaN.
Note that inf == inf + X for any finite X.
np.nan is not equal to np.nan: test with np.isnan(), or test whether a variable is unequal to itself (NaN is the one special case where a value != itself). A sketch follows.
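A small sketch of these special values:

import numpy as np
np.nan == np.nan        # False
x = np.nan
x != x                  # True: usable as a NaN test
np.isnan(x)             # True: the idiomatic test
np.inf == np.inf + 100  # True: inf plus anything finite is still inf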
1.5568595147700188
-0.11741940757185318
a 1.556860
b 0.187127
c -0.245997
dtype: float64
1.5568595147700188
Note: when slicing with index labels (e.g. s['a':'c']), the endpoint after the colon is included. Positional (integer) slicing excludes the endpoint, even when the index labels themselves are numbers. (A sketch follows after the outputs below.)
a 1.556860
b 0.187127
c -0.245997
dtype: float64
a 1.556860
b 0.187127
c -0.245997
dtype: float64
a 4.743900
b 1.205781
c 0.781924
dtype: float64
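A small sketch of the label-versus-position slicing rule (hypothetical values):

import pandas as pd
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
s['a':'c']   # label slice: INCLUDES 'c' -> a, b, c
s[0:2]       # positional slice: excludes position 2 -> a, b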
indices need not be unique
s = pd.Series([2, 3, 5, 7, 11], index=['a', 'a', 'a', 'c', 'd'])
s
a 2
a 3
a 5
c 7
d 11
dtype: int64
a 2
a 3
a 5
dtype: int64
2
a 2
a 3
a 5
c 7
dtype: int64
a 2
a 3
a 5
dtype: int64
As with NumPy, you can index with a boolean Series.
c 7
d 11
dtype: int64
a False
a False
a False
c True
d True
dtype: bool
Series objects are dict-like, in that we can access and update entries via their keys.
Like a dictionary, accessing a non-existent key is a KeyError.
Series also support the in operator, as well as the dictionary get method.
Entries in a Series can be of mixed type (but not sequences). A sketch follows.
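A small sketch of the dict-like behavior:

import pandas as pd
s = pd.Series({'dog': 1, 'cat': 2})
'dog' in s        # True: `in` tests the index
s.get('pig')      # None instead of the KeyError that s['pig'] raises
s.get('pig', 0)   # 0: default value, as with dict.get
s['pig'] = 3      # assigning to a new key adds an entry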
a 2
a 3
a 5
c 7
d 11
Name: random int, dtype: int64
s = pd.Series(['dog', 'fish', 'elephant'])
s.map(lambda s: len(s))
0 3
1 4
2 8
dtype: int64
map on a Series can also change values based on another Series (or dict), used as a lookup table.
s = pd.Series(['fruit', 'animal', 'animal', 'fruit', 'fruit'],
              index=['apple', 'cat', 'goat', 'banana', 'kiwi'])
s
apple fruit
cat animal
goat animal
banana fruit
kiwi fruit
dtype: object
t = pd.Series({'fruit': 0, 'animal': 1})
s.map(t)
apple 0
cat 1
goat 1
banana 0
kiwi 0
dtype: int64
DataFrame
each column can have its own index; the DataFrame's index is the union of all of them, with missing entries set to NaN
d = {'A': pd.Series([1, 2, 3], index={'cat', 'dog', 'bird'}),
     'B': {'dog': 1, 'cat': 2, 'bird': 3, 'goat': 4}}
df = pd.DataFrame(d)
df
        A  B
bird  2.0  3
cat   3.0  2
dog   1.0  1
goat  NaN  4
bird 2.0
cat 3.0
dog 1.0
goat NaN
Name: A, dtype: float64
By default: rows and columns are ordered alphabetically
Index(['A', 'B'], dtype='object')
Index(['bird', 'cat', 'dog', 'goat'], dtype='object')
        A  B    C
bird  2.0  3  4.0
cat   3.0  2  6.0
dog   1.0  1  2.0
goat  NaN  4  NaN
flight['max_delay'] = list(map(max, flight['arr_delay'], flight['dep_delay']))
v = np.array([1, 2, 3, 4, 5])
np.maximum(v, v*2)
array([ 2, 4, 6, 8, 10])
5
flight['max_delay'] = np.maximum(flight['arr_delay'], flight['dep_delay'])
flight['num'] = 0
flight['num']
0 0
1 0
2 0
3 0
4 0
..
336771 0
336772 0
336773 0
336774 0
336775 0
Name: num, Length: 336776, dtype: int64
del flight['num']
flight.columns
Index(['year', 'month', 'day', 'dep_time', 'sched_dep_time', 'dep_delay',
'arr_time', 'sched_arr_time', 'arr_delay', 'carrier', 'flight',
'tailnum', 'origin', 'dest', 'air_time', 'distance', 'hour', 'minute',
'time_hour', 'max_delay'],
dtype='object')
A 3.0
B 2.0
Name: cat, dtype: float64
A 3.0
B 2.0
Name: cat, dtype: float64
cat 3.0
dog 1.0
Name: A, dtype: float64
df1 = pd.DataFrame(np.random.randn(8, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c'])
df1 + df2
          a         b         c   d
0  0.150499 -0.084967 -1.304810 NaN
1 -1.723753  0.809527 -3.027434 NaN
2 -1.770957 -1.115960 -0.606153 NaN
3 -1.322787 -0.154228  0.316819 NaN
4  0.357720  2.129364  2.252070 NaN
5       NaN       NaN       NaN NaN
6       NaN       NaN       NaN NaN
7       NaN       NaN       NaN NaN
          a         b         c
0  0.000000  0.000000  0.000000
1 -0.406540 -0.015429 -0.683493
2 -0.095646 -1.092577  1.542312
3 -0.462943 -0.349725  1.856521
4  0.749604  1.157111  1.999972
a True
b True
c True
d True
dtype: bool
a False
b False
c False
d False
dtype: bool
          0         1         2         3         4         5         6         7
a  0.039975  1.607250  2.643456  0.656756  0.117268  0.264463  0.471706  1.462087
b  0.012506  0.636951  0.002525  0.028437  0.893762  2.970262  0.084393  3.932983
c  0.002420  1.184460  0.797188  0.080707  2.273190  0.834139  2.697453  0.488765
d  0.650234  0.199167  0.196074  0.403005  0.352569  1.245693  0.628392  0.021431
          a         b         c         d
a  7.262961 -4.896682  2.348484 -2.077213
b -4.896682  8.561819  3.042636  2.741910
c  2.348484  3.042636  8.358324 -0.150733
d -2.077213  2.741910 -0.150733  3.696565
df.dropna(axis=1, how='any')
      B
bird  3
cat   2
dog   1
goat  4
        A  B
bird  2.0  3
cat   3.0  2
dog   1.0  1
df.dropna(axis=1, how="all")
        A  B
bird  2.0  3
cat   3.0  2
dog   1.0  1
goat  NaN  4
pandas supports read/write for a wide range of different file formats
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 year 336776 non-null int64
1 month 336776 non-null int64
2 day 336776 non-null int64
3 dep_time 328521 non-null float64
4 sched_dep_time 336776 non-null int64
5 dep_delay 328521 non-null float64
6 arr_time 328063 non-null float64
7 sched_arr_time 336776 non-null int64
8 arr_delay 327346 non-null float64
9 carrier 336776 non-null object
10 flight 336776 non-null int64
11 tailnum 334264 non-null object
12 origin 336776 non-null object
13 dest 336776 non-null object
14 air_time 327346 non-null float64
15 distance 336776 non-null int64
16 hour 336776 non-null int64
17 minute 336776 non-null int64
18 time_hour 336776 non-null object
19 max_delay 327346 non-null float64
dtypes: float64(6), int64(9), object(5)
memory usage: 51.4+ MB
   year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  sched_arr_time  arr_delay carrier  flight tailnum origin dest  air_time  distance  hour  minute             time_hour  max_delay
0  2013      1    1     517.0             515        2.0     830.0             819       11.0      UA    1545  N14228    EWR  IAH     227.0      1400     5      15  2013-01-01T10:00:00Z       11.0
1  2013      1    1     533.0             529        4.0     850.0             830       20.0      UA    1714  N24211    LGA  IAH     227.0      1416     5      29  2013-01-01T10:00:00Z       20.0
2  2013      1    1     542.0             540        2.0     923.0             850       33.0      AA    1141  N619AA    JFK  MIA     160.0      1089     5      40  2013-01-01T10:00:00Z       33.0
3  2013      1    1     544.0             545       -1.0    1004.0            1022      -18.0      B6     725  N804JB    JFK  BQN     183.0      1576     5      45  2013-01-01T10:00:00Z       -1.0
4  2013      1    1     554.0             600       -6.0     812.0             837      -25.0      DL     461  N668DN    LGA  ATL     116.0       762     6       0  2013-01-01T11:00:00Z       -6.0
        year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  sched_arr_time  arr_delay carrier  flight tailnum origin dest  air_time  distance  hour  minute             time_hour  max_delay
336771  2013      9   30       NaN            1455        NaN       NaN            1634        NaN      9E    3393     NaN    JFK  DCA       NaN       213    14      55  2013-09-30T18:00:00Z        NaN
336772  2013      9   30       NaN            2200        NaN       NaN            2312        NaN      9E    3525     NaN    LGA  SYR       NaN       198    22       0  2013-10-01T02:00:00Z        NaN
336773  2013      9   30       NaN            1210        NaN       NaN            1330        NaN      MQ    3461  N535MQ    LGA  BNA       NaN       764    12      10  2013-09-30T16:00:00Z        NaN
336774  2013      9   30       NaN            1159        NaN       NaN            1344        NaN      MQ    3572  N511MQ    LGA  CLE       NaN       419    11      59  2013-09-30T15:00:00Z        NaN
336775  2013      9   30       NaN             840        NaN       NaN            1020        NaN      MQ    3531  N839MQ    LGA  RDU       NaN       431     8      40  2013-09-30T12:00:00Z        NaN
df1 = df * 2
df2 = df + df
(df1 == df2).all()
A False
B True
dtype: bool
False
True
A 4.0
B 5.0
dtype: float64
bird 5.0
cat 5.0
dog 2.0
goat 8.0
dtype: float64
A 4.0
B 5.0
dtype: float64
np.mean applied to a DataFrame automatically skips NaNs (it dispatches to DataFrame.mean), but applied to a plain np.array it returns nan. For example:
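A minimal sketch of the difference:

import numpy as np
import pandas as pd
df = pd.DataFrame({'x': [1.0, np.nan, 3.0]})
np.mean(df)                # dispatches to df.mean(): NaN skipped -> x = 2.0
np.mean(df.to_numpy())     # plain ndarray -> nan
np.nanmean(df.to_numpy())  # 2.0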
           year          month            day       dep_time  sched_dep_time      dep_delay       arr_time  sched_arr_time      arr_delay         flight       air_time       distance           hour         minute      max_delay
count  336776.0  336776.000000  336776.000000  328521.000000   336776.000000  328521.000000  328063.000000   336776.000000  327346.000000  336776.000000  327346.000000  336776.000000  336776.000000  336776.000000  327346.000000
mean     2013.0       6.548510      15.710787    1349.109947     1344.254840      12.639070    1502.054999     1536.380220       6.895377    1971.923620     150.686460    1039.912604      13.180247      26.230100      16.969943
std         0.0       3.414457       8.768607     488.281791      467.335756      40.210061     533.264132      497.457142      44.633292    1632.471938      93.688305     733.233033       4.661316      19.300846      42.376455
min      2013.0       1.000000       1.000000       1.000000      106.000000     -43.000000       1.000000        1.000000     -86.000000       1.000000      20.000000      17.000000       1.000000       0.000000     -33.000000
25%      2013.0       4.000000       8.000000     907.000000      906.000000      -5.000000    1104.000000     1124.000000     -17.000000     553.000000      82.000000     502.000000       9.000000       8.000000      -4.000000
50%      2013.0       7.000000      16.000000    1401.000000     1359.000000      -2.000000    1535.000000     1556.000000      -5.000000    1496.000000     129.000000     872.000000      13.000000      29.000000       1.000000
75%      2013.0      10.000000      23.000000    1744.000000     1729.000000      11.000000    1940.000000     1945.000000      14.000000    3465.000000     192.000000    1389.000000      17.000000      44.000000      19.000000
max      2013.0      12.000000      31.000000    2400.000000     2359.000000    1301.000000    2400.000000     2359.000000    1272.000000    8500.000000     695.000000    4983.000000      23.000000      59.000000    1301.000000
pipe method: method chaining
f = lambda x: x**2
g = lambda x: x + 1
h = lambda x: 2 * x
df = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
df
          A         B         C
0 -0.413218  0.348975 -1.313533
1  1.658054  0.495508  2.021334
2 -1.136187  0.002771 -1.651880
3 -1.125284  0.205869 -0.670324
4  1.085758 -0.103062  2.407278
          A         B          C
0  2.341498  2.243567   5.450740
1  7.498284  2.491057  10.171579
2  4.581844  2.000015   7.457418
3  4.532527  2.084764   2.898668
4  4.357740  2.021243  13.589975
df.pipe(f).pipe(g).pipe(h)
          A         B          C
0  2.341498  2.243567   5.450740
1  7.498284  2.491057  10.171579
2  4.581844  2.000015   7.457418
3  4.532527  2.084764   2.898668
4  4.357740  2.021243  13.589975
row- and column-wise function: apply(). For example:
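A small sketch of apply() along each axis:

import numpy as np
import pandas as pd
df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=['a', 'b', 'c'])
df.apply(np.sum, axis=0)   # column-wise (the default): one value per column
df.apply(np.sum, axis=1)   # row-wise: one value per row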
Pandas II
import pandas as pd
import numpy as np
s = pd.Series(np.random.randn(8))
s.pct_change()
0 NaN
1 -3.496694
2 -1.735719
3 -0.905374
4 2.515764
5 -1.755356
6 0.058851
7 2.335997
dtype: float64
s1 = pd.Series(np.random.randn(100))
s2 = pd.Series(0.1 * s1 + np.random.randn(100))
s1.cov(s2)
0.2999112016156914
s = pd.Series(np.random.randn(5), index=list('abcde'))
s
a -0.709341
b 0.346186
c 0.136598
d -0.826597
e -1.181088
dtype: float64
a 3.0
b 5.0
c 4.0
d 2.0
e 1.0
dtype: float64
a 3.5
b 3.5
c 5.0
d 2.0
e 1.0
dtype: float64
tsdf = pd.DataFrame(np.random.randn(10, 3),
                    columns=['DOW', 'NASDAQ', 'SP500'],
                    index=pd.date_range('1/1/2000', periods=10))
tsdf
                 DOW    NASDAQ     SP500
2000-01-01  0.656688 -1.531741 -0.024392
2000-01-02 -0.706209  0.325865 -0.329847
2000-01-03 -0.127025  0.806435 -0.910426
2000-01-04 -0.401866  0.191501 -0.204383
2000-01-05 -1.283493  1.175840 -1.195107
2000-01-06  0.467726 -0.255641  0.056299
2000-01-07 -0.371994  0.140242 -0.650468
2000-01-08  0.818794  0.461664  1.267799
2000-01-09  2.121635  1.405991 -0.179316
2000-01-10 -1.062810  0.308460  1.338484
apply to each column
tsdf.agg([np.median, np.mean, np.std])
             DOW    NASDAQ     SP500
median -0.249509  0.317162 -0.191849
mean    0.011145  0.302862 -0.083136
std     1.022643  0.814786  0.829758
agg can take a dict of functions
tsdf.agg({'DOW': 'mean', 'NASDAQ': 'median', 'SP500': 'max'})
DOW 0.011145
NASDAQ 0.317162
SP500 1.338484
dtype: float64
tsdf.corr(method='spearman')
             DOW    NASDAQ     SP500
DOW     1.000000 -0.018182  0.369697
NASDAQ -0.018182  1.000000 -0.345455
SP500   0.369697 -0.345455  1.000000
tsdf.corr(method='pearson')
             DOW    NASDAQ     SP500
DOW     1.000000 -0.012922  0.191341
NASDAQ -0.012922  1.000000 -0.222936
SP500   0.191341 -0.222936  1.000000
DataFrame.rank() ranks within each column by default
             DOW  NASDAQ  SP500
2000-01-01   8.0     1.0    7.0
2000-01-02   3.0     6.0    4.0
2000-01-03   6.0     8.0    2.0
2000-01-04   4.0     4.0    5.0
2000-01-05   1.0     9.0    1.0
2000-01-06   7.0     2.0    8.0
2000-01-07   5.0     3.0    3.0
2000-01-08   9.0     7.0    9.0
2000-01-09  10.0    10.0    6.0
2000-01-10   2.0     5.0   10.0
            DOW  NASDAQ  SP500
2000-01-01  3.0     1.0    2.0
2000-01-02  1.0     3.0    2.0
2000-01-03  2.0     3.0    1.0
2000-01-04  1.0     3.0    2.0
2000-01-05  1.0     3.0    2.0
2000-01-06  3.0     1.0    2.0
2000-01-07  2.0     3.0    1.0
2000-01-08  2.0     1.0    3.0
2000-01-09  3.0     2.0    1.0
2000-01-10  1.0     2.0    3.0
iterating over a Series gives an iterator over its values
iterating over a DataFrame gives an iterator over its column names
df.iteritems
df.iterrows
pandas: Group By
df = pd.DataFrame({'A': ['plant', 'animal', 'plant', 'plant'],
                   'B': ['apple', 'goat', 'kiwi', 'grape'],
                   'C': np.random.randn(4),
                   'D': np.random.randn(4)})
df
        A      B         C         D
0   plant  apple  2.392546 -0.554108
1  animal   goat -1.292467  0.800007
2   plant   kiwi  0.718886  0.600089
3   plant  grape -0.665115 -1.205696
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000155D469CD00>
{'animal': [1], 'plant': [0, 2, 3]}
               C         D
A
animal -1.292467  0.800007
plant   0.815439 -0.386571
index = pd.date_range('10/1/1999', periods=1100)
ts = pd.Series(np.random.normal(0.5, 2, 1100), index)
ts.head()
1999-10-01 1.318674
1999-10-02 2.784437
1999-10-03 4.774021
1999-10-04 0.017314
1999-10-05 2.678237
Freq: D, dtype: float64
key = lambda d: d.year
zscore = lambda x: (x - x.mean()) / x.std()
transformed = ts.groupby(key).transform(zscore)
transformed.head()
1999-10-01 0.409280
1999-10-02 1.168705
1999-10-03 2.199526
1999-10-04 -0.264966
1999-10-05 1.113682
Freq: D, dtype: float64
filter returns an ungrouped result
sf = pd.Series([1, 1, 2, 2, 3, 3])
sf.groupby(sf).filter(lambda x: x.sum() > 2)
2 2
3 2
4 3
5 3
dtype: int64
df.groupby().agg()
df.groupby().transform()
pd.concat
df.pivot
(a sketch of the first three follows; pivot is demonstrated next)
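A minimal sketch of agg, transform, and concat (a hypothetical toy frame):

import numpy as np
import pandas as pd
df = pd.DataFrame({'g': ['a', 'a', 'b', 'b'], 'x': [1.0, 2.0, 3.0, 5.0]})
df.groupby('g').agg('mean')             # one row per group
df.groupby('g')['x'].transform('mean')  # group means, same length as the input
pd.concat([df, df]).shape               # stacks the two frames -> (8, 2)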
np.random.seed(0)
df4 = pd.DataFrame({
    "stock": ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'C', 'A', 'B', 'C',
              'A', 'B', 'C', 'D', 'D', 'D', 'D', 'A', 'B', 'C', 'D'],
    "date": ['20210104', '20210104', '20210104', '20210102', '20210102', '20210102',
             '20210103', '20210103', '20210101', '20210101', '20210101',
             '20210105', '20210105', '20210105', '20210102', '20210103', '20210104',
             '20210105', '20210106', '20210106', '20210106', '20210106'],
    "daily_ret": np.random.normal(0, 0.05, 22)})
df4_stacked = df4.pivot(index='date', columns='stock', values='daily_ret')
df4_stacked
stock             A         B         C         D
date
20210101  -0.005161  0.020530  0.007202       NaN
20210102   0.112045  0.093378 -0.048864  0.022193
20210103   0.047504       NaN -0.007568  0.016684
20210104   0.088203  0.020008  0.048937  0.074704
20210105   0.072714  0.038052  0.006084 -0.010258
20210106   0.015653 -0.042705 -0.127649  0.032681
s = df4_stacked.stack()
s
date stock
20210101 A -0.005161
B 0.020530
C 0.007202
20210102 A 0.112045
B 0.093378
C -0.048864
D 0.022193
20210103 A 0.047504
C -0.007568
D 0.016684
20210104 A 0.088203
B 0.020008
C 0.048937
D 0.074704
20210105 A 0.072714
B 0.038052
C 0.006084
D -0.010258
20210106 A 0.015653
B -0.042705
C -0.127649
D 0.032681
dtype: float64
s['20210104', 'A']
s['20210104']['A']
0.0882026172983832
matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
x = np.arange(10)
_ = plt.plot(x**2)
import matplotlib as mpl
mpl.style.available
['Solarize_Light2',
'_classic_test_patch',
'bmh',
'classic',
'dark_background',
'fast',
'fivethirtyeight',
'ggplot',
'grayscale',
'seaborn',
'seaborn-bright',
'seaborn-colorblind',
'seaborn-dark',
'seaborn-dark-palette',
'seaborn-darkgrid',
'seaborn-deep',
'seaborn-muted',
'seaborn-notebook',
'seaborn-paper',
'seaborn-pastel',
'seaborn-poster',
'seaborn-talk',
'seaborn-ticks',
'seaborn-white',
'seaborn-whitegrid',
'tableau-colorblind10']
mpl.style.use('ggplot')
mpl.style.use('seaborn-white')
%matplotlib inline
x = np.arange(10)
_ = plt.plot(x**2)
t = np.arange(10)
s = np.random.normal(1, 1, 10)
_ = plt.bar(t, s, align='center')
fig, axs = plt.subplots(nrows=3, ncols=3)
x = np.arange(0, 100)
axs[0][0].plot(x, x**2)
[<matplotlib.lines.Line2D at 0x155d7666eb0>]
Regular Expressions and Text Encoding
Unicode: ASCII is a subset of UTF-8
re.match()
matches at the start of the string
re.search()
matches any part of the string
re.findall()
finds all matches of the pattern in the string
https://regex101.com
import re
re.match('cat', 'cats and dogs')
<re.Match object; span=(0, 3), match='cat'>
regex = 'cat'
re.match(regex, 'fat on mat') is None
True
if re.match(regex, 'fat on mat'):
    print('matches')
else:
    print('Not')
Not
re.search('cat', 'rains cats and dogs')
<re.Match object; span=(6, 9), match='cat'>
if re.search('cat', 'rains cats and dogs'):
    print('matches')
matches
re.findall('cat', 'rains cats and cat')
['cat', 'cat']
<re.Match object; span=(0, 2), match='\\$'>
<re.Match object; span=(0, 1), match='$'>
raw strings:
"\n" is a newline; r"\n" is a literal backslash followed by n (r"\n" == "\\n")
It is recommended to write regex patterns as raw strings (always prefix them with r).
1 print (r"\n1.00" ,"\n1.00" )
\n1.00
1.00
<re.Match object; span=(0, 1), match='$'>
<re.Match object; span=(0, 1), match='$'>
1 re.findall("[aeiou]" ,"cats and dog" )
['a', 'a', 'o']
1 re.findall("[a-z ]" ,"cats and Dog" )
['c', 'a', 't', 's', ' ', 'a', 'n', 'd', ' ', 'o', 'g']
1 re.findall("[a-z]" ,"cats and Dog" )
['c', 'a', 't', 's', 'a', 'n', 'd', 'o', 'g']
1 re.findall("[A-Z]" ,"cats and Dog" )
['D']
1 re.findall("[a-zA-Z]" ,"cats and Dog" )
['c', 'a', 't', 's', 'a', 'n', 'd', 'D', 'o', 'g']
1 re.findall("[^a-zA-Z]" ,"cats and Dog" )
[' ', ' ']
1 re.findall("[a\-z]" ,"i am a z-ip" )
['a', 'a', 'z', '-']
1 re.findall("[(+*)^]" ,"1+1+2^1=3*(1)" )
['+', '+', '^', '*', '(', ')']
1 re.findall("[^(+*)^]" ,"1+1+2^1=3*(1)" )
['1', '1', '2', '1', '=', '3', '1']
\s : whitespace (space, tab, newline) = [ \t\n]
\w : word characters = [a-zA-Z0-9_]
\d : digits = [0-9]
"^" : inside [...] it negates the class; outside, it anchors the start of a line (every line implicitly starts with an invisible ^ and ends with an invisible $)
"$" : end of a line
"." : matches any character; to match a literal period, use "\."
"o..r" matches "o", then any two characters, then "r"
"\b" : word boundary; r"a\b" matches an "a" that ends a word
To negate these classes, capitalize them (e.g. "\S", "\W", "\D", "\B"); "\B" matches everywhere except a word boundary
"+" : "e+" matches one or more e's
"*" : "e*" matches zero or more e's
1 re.findall("oe+" ,"oeuvree" )
['oe']
1 re.findall("[oe]+" ,"oeuvree" )
['oe', 'ee']
1 re.findall("[whe]*" ,"while wheeee weheee" )
['wh', '', '', 'e', '', 'wheeee', '', 'weheee', '']
‘*’ : zero or more of the previous item
‘+’ : one or more of the previous item
‘?’ : zero or one of the previous item
‘{4}’ : exactly four of the previous item
‘{3,}’ : three or more of previous item
‘{2,5}’ : between two and five (inclusive) of previous item
1 re.findall("whe*" ,"while wheeee weheee heee weee" )
['wh', 'wheeee']
‘|’ (“pipe”) is a special character that allows one to specify “or”
clauses
Example: I want to match the word “cat” or the word “dog”
Solution: ‘(cat|dog)’ (remember the parentheses!)
Note: parentheses are not strictly necessary here, but parentheses
tend to make for easier reading and avoid possible ambiguity. It’s a
good habit to just use them always.
1 re.findall(r"a|aa|aaa" ,"aaaa" )
['a', 'a', 'a', 'a']
1 re.findall(r"a+" ,"aaaa" )
['aaaa']
'?' modifies operators like '+' and '*' to not be greedy (lazy matching)
1 re.findall(r"a+?" ,"aaaa" )
['a', 'a', 'a', 'a']
1 re.findall("\w+@\w+\.\w+" ,"your email is kobe@icloud.com, his email is jacob@umich.edu" )
['kobe@icloud.com', 'jacob@umich.edu']
capture group ():
regex = r'(\w+@\w+\.\w+)'
s = "your email is kobe@icloud.com kobe@icloud.com his email is jacob@umich.edu"
m = re.findall(regex, s)
m
['kobe@icloud.com', 'kobe@icloud.com', 'jacob@umich.edu']
backreferences
\1 requires that whatever matched in the parentheses () occurs once again
r"([a-zA-Z]+).*\1" matches strings that begin and end with the same word, with anything in between
regex = r'(\w+)\1'
s = "freefree"
m = re.search(regex, s)
m.group(1)
'free'
regex = r'\w+'
s = "nonrepeat"
m = re.search(regex, s)
m.group(0)
'nonrepeat'
SQL
Each column has a single type.
key: uniquely identifies a record
sqliteonline.com
features (ACID):
atomicity
consistency
isolation
durability
https://www.w3schools.com/sql/
select
SELECT * FROM countries
SELECT id, name FROM countries
SELECT * FROM countries WHERE [filter]
<> means not-equal (like !=); = tests equality (like ==)
SELECT * FROM countries WHERE [] BETWEEN 86 AND 90
SELECT * FROM countries WHERE [filter] IS NULL
SORT
ORDER BY
SELECT [columns] FROM [table] ORDER BY [column] [ASC|DESC]
SELECT [columns] FROM countries ORDER BY phonecode ASC, name DESC
is equivalent to SELECT [columns] FROM countries ORDER BY phonecode, name DESC
(ASC is the default)
DISTINCT
SELECT DISTINCT [columns] FROM [table]
WHERE
supports set membership:
SELECT id from t_student WHERE major IN
("Mathematics","statistics")
SELECT id from t_student WHERE major NOT IN
("Mathematics","statistics")
supports regex-like matching via LIKE (see the sketch below):
SELECT id from t_student WHERE first_name LIKE "M%"
%: matches any number of characters
_: matches exactly one character
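A minimal sketch of the LIKE wildcards using an in-memory database (the t_student rows here are made up):

import sqlite3
con_demo = sqlite3.connect(":memory:")
con_demo.execute("CREATE TABLE t_student (id INT, first_name TEXT, major TEXT)")
con_demo.executemany("INSERT INTO t_student VALUES (?, ?, ?)",
                     [(1, 'Mary', 'Mathematics'), (2, 'Mike', 'History'),
                      (3, 'Anna', 'Statistics')])
con_demo.execute("SELECT id FROM t_student WHERE first_name LIKE 'M%'").fetchall()
# [(1,), (2,)]: 'M%' matches any name starting with M
con_demo.execute("SELECT id FROM t_student WHERE first_name LIKE 'M_k_'").fetchall()
# [(2,)]: each '_' matches exactly one character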
GROUP BY
SELECT customer_id, SUM(dollar_amount) FROM t_transactions GROUP BY
customer_id
COUNT, AVG, MIN, MAX: aggregate functions
SELECT phonecode, COUNT() FROM countries GROUP BY phonecode
HAVING: filter results after GROUP BY using the HAVING keyword
SELECT customer_id, SUM(dollar_amount) AS total_dolar FROM
t_transactions GROUP BY customer_id HAVING total_dolar >50
JOIN: merging tables
INNER JOIN
SELECT id, name, color FROM t_student INNER JOIN t_personal ON
t_student.id=t_personal.id
LEFT JOIN
RIGHT JOIN
FULL JOIN
INSERT INTO
INSERT INTO table_name [col1, col2, col3, …] VALUES value1, value2,
value3, …
UPDATE
UPDATE table_name SET col1=value1, col2=value2 WHERE condition
Caution: if WHERE is left empty, you'll delete/modify the whole table!
DELETE
DELETE FROM table_name WHERE condition
CREATE TABLE
CREATE TABLE table_name [col1 datatype, col2 datatype, …]
DROP TABLE
SELECT * FROM sqlite_master
SELECT * FROM sqlite_master WHERE type="table"
Python examples from the slides
import sqlite3
con = sqlite3.connect('ps8/flights.sqlite')
cur = con.cursor()
<sqlite3.Connection at 0x2615592b040>
res = cur.execute("select * from flights limit 10")
res
<sqlite3.Cursor at 0x2615594d3c0>
(Actually, res
is just the original cursor itself):
True
for row in res:
    print(row[0])
2013
2013
2013
2013
2013
2013
2013
2013
2013
2013
Fetching results
To retrieve results, use the cur.fetchone()
or
cur.fetchall()
methods:
res = cur.execute("select * from flights limit 10")
cur.fetchone()
(2013,
1,
1,
517,
515,
2.0,
830,
819,
11.0,
'UA',
1545,
'N14228',
'EWR',
'IAH',
227.0,
1400.0,
5.0,
15.0,
1357034400.0)
This returns a tuple containing one entry for each column in our
SELECT
statement.
If you want all the rows, use cur.fetchall()
:
It is an iterator
9
Notice that .fetchall() works like an iterator--we SELECTed 10 rows, fetched one of them using .fetchone(), and then fetched the remaining 9 using .fetchall(). If we run this again we'll get zero rows back because we already fetched everything:
0
Parameterized queries
Sometimes our queries must depend on user input. Consider the
following query which returns all the flights that happened in a certain
month:
def num_flights_in_month(month):
    query = """SELECT COUNT() FROM flights WHERE month = %d""" % month
    return cur.execute(query).fetchone()[0]

num_flights_in_month(1)
27004
For queries that depend on a parameter, we can use the special
placeholder ?
to have the database do parameter
substitution for us:
def num_flights_in_month(m):
    query = """SELECT COUNT() FROM flights WHERE month = ?"""
    return cur.execute(query, [m]).fetchone()[0]

num_flights_in_month(1)
27004
Using multiple placeholders is possible:
def num_flights_on_day(m, d):
    query = """SELECT COUNT() FROM flights WHERE month = ? AND day = ?"""
    return cur.execute(query, [m, d]).fetchone()[0]

num_flights_on_day(12, 25)
719
Notice that, even if there is just one parameter, you have to pass in
all the parameters as a list
.
Why use placeholders
Consider the following table:
scon = sqlite3.connect(":memory:")
scon.execute('CREATE TABLE users (id INTEGER PRIMARY KEY AUTOINCREMENT, username TEXT, age INT, password TEXT)')
scon.execute("""INSERT INTO users(username, age, password)
               VALUES ('Jonathan', 40, 'beeri$g00d'),
                      ('Sarah', 41, 'wat3rp0l0'),
                      ('Julian', 2, 'il0vecr0ut0n3')""")
scon.commit()
Here's our table:
import pandas as pd
pd.read_sql_query("SELECT * from users", scon)
   id  username  age       password
0   1  Jonathan   40     beeri$g00d
1   2     Sarah   41      wat3rp0l0
2   3    Julian    2  il0vecr0ut0n3
Now let's say we have a function which takes a user-supplied
parameter, their username, and returns their user information:
def user_info(username):
    q = "SELECT * FROM users WHERE username = '%s'" % username
    print(q)
    return pd.read_sql_query(q, scon)

user_info('Jonathan')
SELECT * FROM users WHERE username = 'Jonathan'
   id  username  age    password
0   1  Jonathan   40  beeri$g00d
This works great assuming that what got passed in is actually a
username, but a malicious user could do the following:
1 user_info("""abc' or 'a'='a""" )
SELECT * FROM users WHERE username = 'abc' or 'a'='a'
   id  username  age       password
0   1  Jonathan   40     beeri$g00d
1   2     Sarah   41      wat3rp0l0
2   3    Julian    2  il0vecr0ut0n3
This is known as an injection attack. For this reason, we do not pass user input directly into a query string.
Instead we use placeholders, and rely on the SQL engine itself to
perform the substitution:
def safe_user_info(username):
    q = "SELECT * FROM users WHERE username = ?"
    print(q)
    return pd.read_sql_query(q, scon, params=[username])

safe_user_info('Jonathan')
SELECT * FROM users WHERE username = ?
   id  username  age    password
0   1  Jonathan   40  beeri$g00d
1 safe_user_info("""abc' or 'a'='a""" )
SELECT * FROM users WHERE username = ?
Ten questions about flights
Now we will use SQL queries to answer various questions about the flights dataset.
Which airport is the busiest?
We need to aggregate flights by the origin column, and then count the number of rows in each group.
1 cur.execute("SELECT origin, COUNT() FROM flights GROUP BY origin" ).fetchall()
[('EWR', 120835), ('JFK', 111279), ('LGA', 104662)]
1 cur.execute("SELECT origin, COUNT() FROM flights GROUP BY origin" ).description
(('origin', None, None, None, None, None, None),
('COUNT()', None, None, None, None, None, None))
1 pd.read_sql_query("SELECT origin, COUNT() FROM flights GROUP BY origin" ,con)
  origin  COUNT()
0    EWR   120835
1    JFK   111279
2    LGA   104662
Which month was coldest?
We need to aggregate weather by month, and then compute some measure of coldness.
1 cur.execute("SELECT month, AVG(temp) FROM weather GROUP BY month" ).fetchall()
[(1, 35.63566037735852),
(2, 34.27059701492533),
(3, 39.880071845532164),
(4, 51.745641500694774),
(5, 61.79500000000004),
(6, 72.18399999999997),
(7, 80.06622082585272),
(8, 74.4684657039714),
(9, 67.3712922649374),
(10, 60.07113019891505),
(11, 44.9904343764596),
(12, 38.44180037313427)]
What is the most popular carrier at each of the three NYC airports?
This one is a bit trickier. We start by counting the number of flights for each combination of carrier and origin:
1 cur.execute("SELECT COUNT(), carrier, origin FROM flights GROUP BY carrier, origin" ).fetchall()
[(1268, '9E', 'EWR'),
(14651, '9E', 'JFK'),
(2541, '9E', 'LGA'),
(3487, 'AA', 'EWR'),
(13783, 'AA', 'JFK'),
(15459, 'AA', 'LGA'),
(714, 'AS', 'EWR'),
(6557, 'B6', 'EWR'),
(42076, 'B6', 'JFK'),
(6002, 'B6', 'LGA'),
(4342, 'DL', 'EWR'),
(20701, 'DL', 'JFK'),
(23067, 'DL', 'LGA'),
(43939, 'EV', 'EWR'),
(1408, 'EV', 'JFK'),
(8826, 'EV', 'LGA'),
(685, 'F9', 'LGA'),
(3260, 'FL', 'LGA'),
(342, 'HA', 'JFK'),
(2276, 'MQ', 'EWR'),
(7193, 'MQ', 'JFK'),
(16928, 'MQ', 'LGA'),
(6, 'OO', 'EWR'),
(26, 'OO', 'LGA'),
(46087, 'UA', 'EWR'),
(4534, 'UA', 'JFK'),
(8044, 'UA', 'LGA'),
(4405, 'US', 'EWR'),
(2995, 'US', 'JFK'),
(13136, 'US', 'LGA'),
(1566, 'VX', 'EWR'),
(3596, 'VX', 'JFK'),
(6188, 'WN', 'EWR'),
(6087, 'WN', 'LGA'),
(601, 'YV', 'LGA')]
Next, we need to rank the carriers within origin by their counts.
cur.execute("""
SELECT RANK() OVER (
    PARTITION BY origin
    ORDER BY n
) AS r, carrier, origin, n
FROM (SELECT COUNT() as n, carrier, origin FROM flights GROUP BY carrier, origin)
""").fetchall()
[(1, 'OO', 'EWR', 6),
(2, 'AS', 'EWR', 714),
(3, '9E', 'EWR', 1268),
(4, 'VX', 'EWR', 1566),
(5, 'MQ', 'EWR', 2276),
(6, 'AA', 'EWR', 3487),
(7, 'DL', 'EWR', 4342),
(8, 'US', 'EWR', 4405),
(9, 'WN', 'EWR', 6188),
(10, 'B6', 'EWR', 6557),
(11, 'EV', 'EWR', 43939),
(12, 'UA', 'EWR', 46087),
(1, 'HA', 'JFK', 342),
(2, 'EV', 'JFK', 1408),
(3, 'US', 'JFK', 2995),
(4, 'VX', 'JFK', 3596),
(5, 'UA', 'JFK', 4534),
(6, 'MQ', 'JFK', 7193),
(7, 'AA', 'JFK', 13783),
(8, '9E', 'JFK', 14651),
(9, 'DL', 'JFK', 20701),
(10, 'B6', 'JFK', 42076),
(1, 'OO', 'LGA', 26),
(2, 'YV', 'LGA', 601),
(3, 'F9', 'LGA', 685),
(4, '9E', 'LGA', 2541),
(5, 'FL', 'LGA', 3260),
(6, 'B6', 'LGA', 6002),
(7, 'WN', 'LGA', 6087),
(8, 'UA', 'LGA', 8044),
(9, 'EV', 'LGA', 8826),
(10, 'US', 'LGA', 13136),
(11, 'AA', 'LGA', 15459),
(12, 'MQ', 'LGA', 16928),
(13, 'DL', 'LGA', 23067)]
Finally, to find the most popular carriers, we can filter this table
to only the rows where rank=1
. To do this I will use a
subquery :
SELECT * FROM (<another query>)
1 2 3 4 5 6 7 8 cur.execute(""" SELECT * FROM (SELECT RANK() OVER ( PARTITION BY origin ORDER BY n DESC ) AS r, carrier, origin, n FROM (SELECT COUNT() as n, carrier, origin FROM flights GROUP BY carrier, origin) ) WHERE r = 1 """ ).fetchall()
[(1, 'UA', 'EWR', 46087), (1, 'B6', 'JFK', 42076), (1, 'DL', 'LGA', 23067)]
1 2 3 4 5 6 7 8 9 cur.execute(""" SELECT * FROM (SELECT RANK() OVER ( PARTITION BY origin ORDER BY n DESC ) AS r, carrier, origin, n FROM (SELECT COUNT() as n, carrier, origin FROM flights GROUP BY carrier, origin) ) AS subq JOIN airlines ON airlines.carrier = subq.carrier WHERE r = 1 """ ).fetchall()
[(1, 'B6', 'JFK', 42076, 'B6', 'JetBlue Airways'),
(1, 'DL', 'LGA', 23067, 'DL', 'Delta Air Lines Inc.'),
(1, 'UA', 'EWR', 46087, 'UA', 'United Air Lines Inc.')]
What are the busiest travel weeks of the year?
We need to figure out the week of each flight, and then aggregate them. To manipulate dates in sqlite, we have the strftime() function:
1 cur.execute("""SELECT strftime('%W', 'now')""" ).fetchone()
('11',)
1 cur.execute("""SELECT strftime('The day of the week is %w, the week of the year is %W', 'now')""" ).fetchone()
('The day of the week is 0, the week of the year is 11',)
The Unix epoch
The time_hour column in flights looks like:
1 cur.execute("""SELECT time_hour FROM flights""" ).fetchone()
(1357034400.0,)
This number represents the number of seconds that have elapsed since
midnight GMT on January 1, 1970, a date known as the Unix epoch . We can
use strftime()
on these dates provided we tell it how they
are formatted.
1 2 cur.execute("""SELECT datetime(1357034400, 'unixepoch')""" ).fetchone()
('2013-01-01 10:00:00',)
Now we can convert each date to a week of the year:
1 pd.read_sql_query("""SELECT year, month, day, strftime('%W', time_hour, 'unixepoch') from flights""" , con)
        year  month  day  strftime('%W', time_hour, 'unixepoch')
0       2013      1    1                                       00
1       2013      1    1                                       00
2       2013      1    1                                       00
3       2013      1    1                                       00
4       2013      1    1                                       00
...      ...    ...  ...                                      ...
336771  2013      9   30                                       39
336772  2013      9   30                                       39
336773  2013      9   30                                       39
336774  2013      9   30                                       39
336775  2013      9   30                                       39

336776 rows × 4 columns
The final query looks like:
1 2 3 4 5 con.execute(""" SELECT COUNT() as n, strftime('%W', time_hour, 'unixepoch') as week FROM flights GROUP BY week ORDER BY n DESC """ ).fetchall()[:10 ]
[(6760, '27'),
(6740, '29'),
(6737, '28'),
(6735, '30'),
(6733, '31'),
(6702, '25'),
(6687, '32'),
(6675, '24'),
(6639, '23'),
(6631, '33')]
What were the ten worst days for average departure delay?
con.execute('''SELECT year, month, day, AVG(dep_delay) AS avg_delay
FROM flights
GROUP BY year, month, day
ORDER BY avg_delay DESC LIMIT 10''').fetchall()
[(2013, 3, 8, 83.5369211514393),
(2013, 7, 1, 56.23382519863791),
(2013, 9, 2, 53.029550827423165),
(2013, 7, 10, 52.860702151755376),
(2013, 12, 5, 52.32799013563502),
(2013, 5, 23, 51.14471968709257),
(2013, 9, 12, 49.95875),
(2013, 6, 28, 48.827784156142364),
(2013, 6, 24, 47.15741789354473),
(2013, 7, 22, 46.66704675028506)]
What were the ten best days for departure delay?
con.execute('''SELECT year, month, day, AVG(dep_delay) AS avg_delay
FROM flights
GROUP BY year, month, day
ORDER BY avg_delay ASC LIMIT 10''').fetchall()
[(2013, 9, 24, -1.3298319327731092),
(2013, 10, 2, -0.919917864476386),
(2013, 11, 9, -0.6288515406162465),
(2013, 9, 6, -0.39792746113989635),
(2013, 9, 5, -0.3877973112719752),
(2013, 10, 29, -0.34932221063607927),
(2013, 9, 7, -0.25036390101892286),
(2013, 11, 5, -0.18314255983350677),
(2013, 10, 19, -0.10703812316715543),
(2013, 10, 1, -0.09895833333333333)]
Which airlines have the most departure delay?
con.execute('''SELECT airlines.carrier, name, AVG(dep_delay) AS avg_delay
FROM flights
JOIN airlines ON airlines.carrier = flights.carrier
GROUP BY airlines.name
ORDER BY avg_delay DESC LIMIT 10''').fetchall()
[('F9', 'Frontier Airlines Inc.', 20.215542521994134),
('EV', 'ExpressJet Airlines Inc.', 19.955389827868213),
('YV', 'Mesa Airlines Inc.', 18.996330275229358),
('FL', 'AirTran Airways Corporation', 18.72607467838092),
('WN', 'Southwest Airlines Co.', 17.71174377224199),
('9E', 'Endeavor Air Inc.', 16.725769407441433),
('B6', 'JetBlue Airways', 13.022522106740018),
('VX', 'Virgin America', 12.869421165464821),
('OO', 'SkyWest Airlines Inc.', 12.586206896551724),
('UA', 'United Air Lines Inc.', 12.106072888459614)]
What airplane flew the most days?
Airplanes are identified by their tail number.
cur.execute('''SELECT tailnum, COUNT() AS n
FROM (SELECT DISTINCT year, month, day, tailnum FROM flights)
GROUP BY tailnum
ORDER BY n DESC LIMIT 1''').fetchall()
[('N328AA', 336)]
Interestingly, you can Google most tail numbers: N328AA .
How many flights were
cancelled?
We'll say that a cancelled flight has dep_time
missing.
cur.execute('''SELECT COUNT() FROM flights WHERE dep_time IS NULL''').fetchone()
(0,)
Notice that we don't write = NULL
. That does not
work:
cur.execute('''SELECT COUNT() FROM flights WHERE dep_time = NULL''').fetchone()
(0,)
How many flights were destined for airports with a one-word name?
(Not including the word "airport")
Here is an example of an airport with a one-word name:
1 cur.execute("SELECT * FROM airports WHERE name = 'Packwood'" ).description
(('faa', None, None, None, None, None, None),
('name', None, None, None, None, None, None),
('lat', None, None, None, None, None, None),
('lon', None, None, None, None, None, None),
('alt', None, None, None, None, None, None),
('tz', None, None, None, None, None, None),
('dst', None, None, None, None, None, None),
('tzone', None, None, None, None, None, None))
CAST(xxx AS INTEGER)
cur.execute("""
SELECT name FROM airports WHERE name NOT LIKE '% %'
""").fetchall()
[('Packwood',),
('Florence',),
('Arcata',),
('Kodiak',),
('Addison',),
('Alexandria',),
('Centennial',),
('Lakeland',),
('Appleton',),
('Executive',),
('Avalon',),
('Kalamazoo',),
('Bethel',),
('Beaufort',),
('Bettles',),
('KBWD',),
('CedarKey',),
('Clemson',),
('Charlottesville-Albemarle',),
('Chehalis-Centralia',),
('Champaign',),
('Yeager',),
('Tri-Cities',),
('Decatur',),
('Dillingham',),
('Dupage',),
('Destin',),
('Unalaska',),
('Orlando',),
('Bessemer',),
('Wendover',),
('Franklin',),
('Tipton',),
('Gulkana',),
('Gulfport-Biloxi',),
('Majors',),
('Dillingham',),
('Hana',),
('Homer',),
('Iliamna',),
('Immokalee',),
('Kapalua',),
('Logan-Cache',),
('Lihue',),
('Lincoln',),
('Lanai',),
('Castle',),
('Mojave',),
('Downtown',),
('Molokai',),
('Manassas',),
('MariposaYosemite',),
('Lakefront',),
('County',),
('Kahului',),
('Nome',),
('Executive',),
('Northway',),
('Platinum',),
('Renton',),
('Rochester',),
('Prosser',),
('Deadhorse',),
('Sedona',),
('Gillespie',),
('Teterboro',),
('Telluride',),
('Talkeetna',),
('Toledo',),
('Upolu',),
('Apopka',),
('Yakutat',),
('Clear',)]
cur.execute("""
SELECT COUNT() FROM flights
INNER JOIN (SELECT faa FROM airports WHERE name NOT LIKE '% %') one_word
ON one_word.faa = flights.dest
""").fetchall()
[(190,)]