Programming in Python (1)

String

1
2
seq = "this is a new world"
seq[2::-2]
'it'
1
seq[-1::-2]
'drwwnas it'
1
2
# 倒序
seq[::-1]
'dlrow wen a si siht'
1
2
3
4
# >>> seq.split(" ")
# AttributeError: 'list' object has no attribute 'split'

"this is a new world".split(" ")
['this', 'is', 'a', 'new', 'world']
1
2
# Delete 头尾的\n...
'this\n'.strip()
'this'
1
2
3
4
5
word = ""
if word:
print('True')
else:
print('False')
False
1
2
3
4
my_list = [1,2,3,4]
1 in my_list
[1,2] in my_list
[1] in my_list
False
1
2
3
my_letters = "abc"
list(my_letters)==my_list
## List is not String
False
1
type(my_letters)
str
1
type(list(my_letters))
list
1
2
"".join(list(my_letters))
# 不同于 str("".join(list(my_letters)))
'abc'
1
2
"-".join(list(my_letters))
## The list can not be numbers like [1,2,3]
'a-b-c'

How to change some letters in the String?

1
2
3
4
5
# Error: my_letters[0]=z
# Correct:
my_list = list(my_letters)
my_list[0]="z"
my_list
['z', 'b', 'c']
1
2
fruits = ["apple","banana","orange"]
list(enumerate(fruits))
[(0, 'apple'), (1, 'banana'), (2, 'orange')]
1
list(enumerate(fruits, start=1)) 
[(1, 'apple'), (2, 'banana'), (3, 'orange')]
1
2
3
for i,f in enumerate(fruits):
fruits[i]=f.upper()
fruits
['APPLE', 'BANANA', 'ORANGE']

List

  • lists are iterables, not iterators: call iter() on a list to get an iterator; next(it) is equivalent to it.__next__()
  • for loop 就是这种工作方式
1
numbers = list(range(1,11))
1
range(1,11)
range(1, 11)
1
numbers
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
1
2
def square(x):
    """Return ``x`` multiplied by itself."""
    return x**2
1
list(map(lambda x: x**2 if(x%2==0) else None, numbers))
[None, 4, None, 16, None, 36, None, 64, None, 100]

map function

1
map(square,numbers)
<map at 0x1ce204a6950>
1
list(map(square,numbers))
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]

Anonymous Function:

1
list(map(lambda x: x**2, numbers))
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
1
[x**2 for x in numbers]
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
1
[x for x in numbers if x%2==0]
[2, 4, 6, 8, 10]

Dicts and tuples

1
my_large_list = list(range(10_000_000))
1
2
my_large_dict = {v: v for v in my_large_list}
my_large_dict
{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: 42,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 55: 55,
 56: 56,
 57: 57,
 58: 58,
 59: 59,
 60: 60,
 61: 61,
 62: 62,
 63: 63,
 64: 64,
 65: 65,
 66: 66,
 67: 67,
 68: 68,
 69: 69,
 70: 70,
 71: 71,
 72: 72,
 73: 73,
 74: 74,
 75: 75,
 76: 76,
 77: 77,
 78: 78,
 79: 79,
 80: 80,
 81: 81,
 82: 82,
 83: 83,
 84: 84,
 85: 85,
 86: 86,
 87: 87,
 88: 88,
 89: 89,
 90: 90,
 91: 91,
 92: 92,
 93: 93,
 94: 94,
 95: 95,
 96: 96,
 97: 97,
 98: 98,
 99: 99,
 100: 100,
 101: 101,
 102: 102,
 103: 103,
 104: 104,
 105: 105,
 106: 106,
 107: 107,
 108: 108,
 109: 109,
 110: 110,
 111: 111,
 112: 112,
 113: 113,
 114: 114,
 115: 115,
 116: 116,
 117: 117,
 118: 118,
 119: 119,
 120: 120,
 121: 121,
 122: 122,
 123: 123,
 124: 124,
 125: 125,
 126: 126,
 127: 127,
 128: 128,
 129: 129,
 130: 130,
 131: 131,
 132: 132,
 133: 133,
 134: 134,
 135: 135,
 136: 136,
 137: 137,
 138: 138,
 139: 139,
 140: 140,
 141: 141,
 142: 142,
 143: 143,
 144: 144,
 145: 145,
 146: 146,
 147: 147,
 148: 148,
 149: 149,
 150: 150,
 151: 151,
 152: 152,
 153: 153,
 154: 154,
 155: 155,
 156: 156,
 157: 157,
 158: 158,
 159: 159,
 160: 160,
 161: 161,
 162: 162,
 163: 163,
 164: 164,
 165: 165,
 166: 166,
 167: 167,
 168: 168,
 169: 169,
 170: 170,
 171: 171,
 172: 172,
 173: 173,
 174: 174,
 175: 175,
 176: 176,
 177: 177,
 178: 178,
 179: 179,
 180: 180,
 181: 181,
 182: 182,
 183: 183,
 184: 184,
 185: 185,
 186: 186,
 187: 187,
 188: 188,
 189: 189,
 190: 190,
 191: 191,
 192: 192,
 193: 193,
 194: 194,
 195: 195,
 196: 196,
 197: 197,
 198: 198,
 199: 199,
 200: 200,
 201: 201,
 202: 202,
 203: 203,
 204: 204,
 205: 205,
 206: 206,
 207: 207,
 208: 208,
 209: 209,
 210: 210,
 211: 211,
 212: 212,
 213: 213,
 214: 214,
 215: 215,
 216: 216,
 217: 217,
 218: 218,
 219: 219,
 220: 220,
 221: 221,
 222: 222,
 223: 223,
 224: 224,
 225: 225,
 226: 226,
 227: 227,
 228: 228,
 229: 229,
 230: 230,
 231: 231,
 232: 232,
 233: 233,
 234: 234,
 235: 235,
 236: 236,
 237: 237,
 238: 238,
 239: 239,
 240: 240,
 241: 241,
 242: 242,
 243: 243,
 244: 244,
 245: 245,
 246: 246,
 247: 247,
 248: 248,
 249: 249,
 250: 250,
 251: 251,
 252: 252,
 253: 253,
 254: 254,
 255: 255,
 256: 256,
 257: 257,
 258: 258,
 259: 259,
 260: 260,
 261: 261,
 262: 262,
 263: 263,
 264: 264,
 265: 265,
 266: 266,
 267: 267,
 268: 268,
 269: 269,
 270: 270,
 271: 271,
 272: 272,
 273: 273,
 274: 274,
 275: 275,
 276: 276,
 277: 277,
 278: 278,
 279: 279,
 280: 280,
 281: 281,
 282: 282,
 283: 283,
 284: 284,
 285: 285,
 286: 286,
 287: 287,
 288: 288,
 289: 289,
 290: 290,
 291: 291,
 292: 292,
 293: 293,
 294: 294,
 295: 295,
 296: 296,
 297: 297,
 298: 298,
 299: 299,
 300: 300,
 301: 301,
 302: 302,
 303: 303,
 304: 304,
 305: 305,
 306: 306,
 307: 307,
 308: 308,
 309: 309,
 310: 310,
 311: 311,
 312: 312,
 313: 313,
 314: 314,
 315: 315,
 316: 316,
 317: 317,
 318: 318,
 319: 319,
 320: 320,
 321: 321,
 322: 322,
 323: 323,
 324: 324,
 325: 325,
 326: 326,
 327: 327,
 328: 328,
 329: 329,
 330: 330,
 331: 331,
 332: 332,
 333: 333,
 334: 334,
 335: 335,
 336: 336,
 337: 337,
 338: 338,
 339: 339,
 340: 340,
 341: 341,
 342: 342,
 343: 343,
 344: 344,
 345: 345,
 346: 346,
 347: 347,
 348: 348,
 349: 349,
 350: 350,
 351: 351,
 352: 352,
 353: 353,
 354: 354,
 355: 355,
 356: 356,
 357: 357,
 358: 358,
 359: 359,
 360: 360,
 361: 361,
 362: 362,
 363: 363,
 364: 364,
 365: 365,
 366: 366,
 367: 367,
 368: 368,
 369: 369,
 370: 370,
 371: 371,
 372: 372,
 373: 373,
 374: 374,
 375: 375,
 376: 376,
 377: 377,
 378: 378,
 379: 379,
 380: 380,
 381: 381,
 382: 382,
 383: 383,
 384: 384,
 385: 385,
 386: 386,
 387: 387,
 388: 388,
 389: 389,
 390: 390,
 391: 391,
 392: 392,
 393: 393,
 394: 394,
 395: 395,
 396: 396,
 397: 397,
 398: 398,
 399: 399,
 400: 400,
 401: 401,
 402: 402,
 403: 403,
 404: 404,
 405: 405,
 406: 406,
 407: 407,
 408: 408,
 409: 409,
 410: 410,
 411: 411,
 412: 412,
 413: 413,
 414: 414,
 415: 415,
 416: 416,
 417: 417,
 418: 418,
 419: 419,
 420: 420,
 421: 421,
 422: 422,
 423: 423,
 424: 424,
 425: 425,
 426: 426,
 427: 427,
 428: 428,
 429: 429,
 430: 430,
 431: 431,
 432: 432,
 433: 433,
 434: 434,
 435: 435,
 436: 436,
 437: 437,
 438: 438,
 439: 439,
 440: 440,
 441: 441,
 442: 442,
 443: 443,
 444: 444,
 445: 445,
 446: 446,
 447: 447,
 448: 448,
 449: 449,
 450: 450,
 451: 451,
 452: 452,
 453: 453,
 454: 454,
 455: 455,
 456: 456,
 457: 457,
 458: 458,
 459: 459,
 460: 460,
 461: 461,
 462: 462,
 463: 463,
 464: 464,
 465: 465,
 466: 466,
 467: 467,
 468: 468,
 469: 469,
 470: 470,
 471: 471,
 472: 472,
 473: 473,
 474: 474,
 475: 475,
 476: 476,
 477: 477,
 478: 478,
 479: 479,
 480: 480,
 481: 481,
 482: 482,
 483: 483,
 484: 484,
 485: 485,
 486: 486,
 487: 487,
 488: 488,
 489: 489,
 490: 490,
 491: 491,
 492: 492,
 493: 493,
 494: 494,
 495: 495,
 496: 496,
 497: 497,
 498: 498,
 499: 499,
 500: 500,
 501: 501,
 502: 502,
 503: 503,
 504: 504,
 505: 505,
 506: 506,
 507: 507,
 508: 508,
 509: 509,
 510: 510,
 511: 511,
 512: 512,
 513: 513,
 514: 514,
 515: 515,
 516: 516,
 517: 517,
 518: 518,
 519: 519,
 520: 520,
 521: 521,
 522: 522,
 523: 523,
 524: 524,
 525: 525,
 526: 526,
 527: 527,
 528: 528,
 529: 529,
 530: 530,
 531: 531,
 532: 532,
 533: 533,
 534: 534,
 535: 535,
 536: 536,
 537: 537,
 538: 538,
 539: 539,
 540: 540,
 541: 541,
 542: 542,
 543: 543,
 544: 544,
 545: 545,
 546: 546,
 547: 547,
 548: 548,
 549: 549,
 550: 550,
 551: 551,
 552: 552,
 553: 553,
 554: 554,
 555: 555,
 556: 556,
 557: 557,
 558: 558,
 559: 559,
 560: 560,
 561: 561,
 562: 562,
 563: 563,
 564: 564,
 565: 565,
 566: 566,
 567: 567,
 568: 568,
 569: 569,
 570: 570,
 571: 571,
 572: 572,
 573: 573,
 574: 574,
 575: 575,
 576: 576,
 577: 577,
 578: 578,
 579: 579,
 580: 580,
 581: 581,
 582: 582,
 583: 583,
 584: 584,
 585: 585,
 586: 586,
 587: 587,
 588: 588,
 589: 589,
 590: 590,
 591: 591,
 592: 592,
 593: 593,
 594: 594,
 595: 595,
 596: 596,
 597: 597,
 598: 598,
 599: 599,
 600: 600,
 601: 601,
 602: 602,
 603: 603,
 604: 604,
 605: 605,
 606: 606,
 607: 607,
 608: 608,
 609: 609,
 610: 610,
 611: 611,
 612: 612,
 613: 613,
 614: 614,
 615: 615,
 616: 616,
 617: 617,
 618: 618,
 619: 619,
 620: 620,
 621: 621,
 622: 622,
 623: 623,
 624: 624,
 625: 625,
 626: 626,
 627: 627,
 628: 628,
 629: 629,
 630: 630,
 631: 631,
 632: 632,
 633: 633,
 634: 634,
 635: 635,
 636: 636,
 637: 637,
 638: 638,
 639: 639,
 640: 640,
 641: 641,
 642: 642,
 643: 643,
 644: 644,
 645: 645,
 646: 646,
 647: 647,
 648: 648,
 649: 649,
 650: 650,
 651: 651,
 652: 652,
 653: 653,
 654: 654,
 655: 655,
 656: 656,
 657: 657,
 658: 658,
 659: 659,
 660: 660,
 661: 661,
 662: 662,
 663: 663,
 664: 664,
 665: 665,
 666: 666,
 667: 667,
 668: 668,
 669: 669,
 670: 670,
 671: 671,
 672: 672,
 673: 673,
 674: 674,
 675: 675,
 676: 676,
 677: 677,
 678: 678,
 679: 679,
 680: 680,
 681: 681,
 682: 682,
 683: 683,
 684: 684,
 685: 685,
 686: 686,
 687: 687,
 688: 688,
 689: 689,
 690: 690,
 691: 691,
 692: 692,
 693: 693,
 694: 694,
 695: 695,
 696: 696,
 697: 697,
 698: 698,
 699: 699,
 700: 700,
 701: 701,
 702: 702,
 703: 703,
 704: 704,
 705: 705,
 706: 706,
 707: 707,
 708: 708,
 709: 709,
 710: 710,
 711: 711,
 712: 712,
 713: 713,
 714: 714,
 715: 715,
 716: 716,
 717: 717,
 718: 718,
 719: 719,
 720: 720,
 721: 721,
 722: 722,
 723: 723,
 724: 724,
 725: 725,
 726: 726,
 727: 727,
 728: 728,
 729: 729,
 730: 730,
 731: 731,
 732: 732,
 733: 733,
 734: 734,
 735: 735,
 736: 736,
 737: 737,
 738: 738,
 739: 739,
 740: 740,
 741: 741,
 742: 742,
 743: 743,
 744: 744,
 745: 745,
 746: 746,
 747: 747,
 748: 748,
 749: 749,
 750: 750,
 751: 751,
 752: 752,
 753: 753,
 754: 754,
 755: 755,
 756: 756,
 757: 757,
 758: 758,
 759: 759,
 760: 760,
 761: 761,
 762: 762,
 763: 763,
 764: 764,
 765: 765,
 766: 766,
 767: 767,
 768: 768,
 769: 769,
 770: 770,
 771: 771,
 772: 772,
 773: 773,
 774: 774,
 775: 775,
 776: 776,
 777: 777,
 778: 778,
 779: 779,
 780: 780,
 781: 781,
 782: 782,
 783: 783,
 784: 784,
 785: 785,
 786: 786,
 787: 787,
 788: 788,
 789: 789,
 790: 790,
 791: 791,
 792: 792,
 793: 793,
 794: 794,
 795: 795,
 796: 796,
 797: 797,
 798: 798,
 799: 799,
 800: 800,
 801: 801,
 802: 802,
 803: 803,
 804: 804,
 805: 805,
 806: 806,
 807: 807,
 808: 808,
 809: 809,
 810: 810,
 811: 811,
 812: 812,
 813: 813,
 814: 814,
 815: 815,
 816: 816,
 817: 817,
 818: 818,
 819: 819,
 820: 820,
 821: 821,
 822: 822,
 823: 823,
 824: 824,
 825: 825,
 826: 826,
 827: 827,
 828: 828,
 829: 829,
 830: 830,
 831: 831,
 832: 832,
 833: 833,
 834: 834,
 835: 835,
 836: 836,
 837: 837,
 838: 838,
 839: 839,
 840: 840,
 841: 841,
 842: 842,
 843: 843,
 844: 844,
 845: 845,
 846: 846,
 847: 847,
 848: 848,
 849: 849,
 850: 850,
 851: 851,
 852: 852,
 853: 853,
 854: 854,
 855: 855,
 856: 856,
 857: 857,
 858: 858,
 859: 859,
 860: 860,
 861: 861,
 862: 862,
 863: 863,
 864: 864,
 865: 865,
 866: 866,
 867: 867,
 868: 868,
 869: 869,
 870: 870,
 871: 871,
 872: 872,
 873: 873,
 874: 874,
 875: 875,
 876: 876,
 877: 877,
 878: 878,
 879: 879,
 880: 880,
 881: 881,
 882: 882,
 883: 883,
 884: 884,
 885: 885,
 886: 886,
 887: 887,
 888: 888,
 889: 889,
 890: 890,
 891: 891,
 892: 892,
 893: 893,
 894: 894,
 895: 895,
 896: 896,
 897: 897,
 898: 898,
 899: 899,
 900: 900,
 901: 901,
 902: 902,
 903: 903,
 904: 904,
 905: 905,
 906: 906,
 907: 907,
 908: 908,
 909: 909,
 910: 910,
 911: 911,
 912: 912,
 913: 913,
 914: 914,
 915: 915,
 916: 916,
 917: 917,
 918: 918,
 919: 919,
 920: 920,
 921: 921,
 922: 922,
 923: 923,
 924: 924,
 925: 925,
 926: 926,
 927: 927,
 928: 928,
 929: 929,
 930: 930,
 931: 931,
 932: 932,
 933: 933,
 934: 934,
 935: 935,
 936: 936,
 937: 937,
 938: 938,
 939: 939,
 940: 940,
 941: 941,
 942: 942,
 943: 943,
 944: 944,
 945: 945,
 946: 946,
 947: 947,
 948: 948,
 949: 949,
 950: 950,
 951: 951,
 952: 952,
 953: 953,
 954: 954,
 955: 955,
 956: 956,
 957: 957,
 958: 958,
 959: 959,
 960: 960,
 961: 961,
 962: 962,
 963: 963,
 964: 964,
 965: 965,
 966: 966,
 967: 967,
 968: 968,
 969: 969,
 970: 970,
 971: 971,
 972: 972,
 973: 973,
 974: 974,
 975: 975,
 976: 976,
 977: 977,
 978: 978,
 979: 979,
 980: 980,
 981: 981,
 982: 982,
 983: 983,
 984: 984,
 985: 985,
 986: 986,
 987: 987,
 988: 988,
 989: 989,
 990: 990,
 991: 991,
 992: 992,
 993: 993,
 994: 994,
 995: 995,
 996: 996,
 997: 997,
 998: 998,
 999: 999,
 ...}

hash() function

1
2
# Get hash of the object
hash(10_000_000)
10000000

Operation time of the loops

1
2
%%timeit
10_000_000 in my_large_dict
327 ns ± 141 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

Dict

1
d = {1:'jonathan',2:'julian'}
1
list(d)
[1, 2]
1
2
# items() yields (key, value) pairs, so the value can be used directly
# instead of looking it up again with d[key].
for key, value in d.items():
    print(value)
jonathan
julian
1
list(d.items())
[(1, 'jonathan'), (2, 'julian')]
1
len(d)
2
1
d[1]
'jonathan'
1
2
my_list = [1,2,3]
a,b,c = my_list
1
a,b = (b,a)
1
b
1

There's no difference between these two expressions.

1
2
3
a,b = b,a
a

1
1
2
3
4
5
from functools import reduce
seq = [0,1,2,3,4,5]
add = lambda x, y: x+y
seq_avg = reduce(add,seq)/len(seq)
seq_avg
2.5

Building a dict is slower than building a list of the same size, but lookups (membership tests) in a dict are much faster.

1
phon = {x: x for x in range(100_000_000)}
1
2
3
# NOTE(review): unfinished snippet (indentation was lost in the export and the
# body stops after the first branch). As written, a negative ``c`` returns the
# built-in ``float`` type itself; presumably ``float("inf")`` was intended -- confirm.
def min_coins(c, S):
if c<0:
return float

Brief of if and else

result_true if cond(A) else result_false

Application of dict, list,tuple

List

1
my_list = list(range(3))
1
my_list[0], my_list[-1], my_list[:2], my_list[-2:]
(0, 2, [0, 1], [1, 2])
1
2
my_list.append(3)
my_list
[0, 1, 2, 3]
1
my_list+my_list
[0, 1, 2, 3, 0, 1, 2, 3]
1
my_list*3
[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]
1
4 in my_list # expensive:searching over whole list
False

my_list = [func(x) for x in iterable if cond(x)]

1
[x for x in range(10) if x % 3 ==0]
[0, 3, 6, 9]
1
range(0,10_000_000_000)
range(0, 10000000000)

If you call list(range(10_000_000_000)), it will likely crash your computer: list() iterates over its argument and stores every element in memory, whereas range is lazy. list() therefore requires an iterable argument; see help(list).

Exercise: is_prime

1
2
3
4
5
6
def is_prime(n):
    """Return True if ``n`` is a prime number, otherwise False.

    Numbers below 2 (0, 1 and negatives) are not prime. Trial division
    only needs to test divisors ``k`` with ``k * k <= n``: any factor
    larger than the square root is paired with one smaller than it.
    """
    if n < 2:
        return False
    k = 2
    while k * k <= n:
        if n % k == 0:
            return False
        k += 1
    return True
is_prime(4)
False

all() function

1
2
3
def is_prime(n):
    """Return True if ``n`` is prime, using all() over the candidate divisors.

    The ``n >= 2`` guard is needed because all() of an empty iterable is
    True, which would otherwise make 0 and 1 (and negatives) look prime.
    """
    return n >= 2 and all(n % k != 0 for k in range(2, n))
is_prime(5)
True
1
len([n for n in range(2,101) if is_prime(n)])
25

Exercise mean()

1
2
3
4
5
6
def mean(x: list[float]) -> float:
    """Return the arithmetic mean of the numbers in ``x``.

    Raises:
        ZeroDivisionError: if ``x`` is empty.
    """
    return sum(x) / len(x)

# Error: mean("this is not a float")
mean([1,2,3,4])
2.5
1
2
3
4
5
6
7
8
9
10
11
12
13
def median(x: list[float]) -> float:
    """Return the median of the numbers in ``x`` without modifying ``x``.

    For an odd number of values this is the middle element of the sorted
    data; for an even number it is the average of the two middle elements.

    Raises:
        IndexError: if ``x`` is empty.
    """
    ordered = sorted(x)  # sorted() returns a new list; the caller's list is untouched
    n = len(ordered)
    mid = n // 2
    if n % 2 == 1:
        return ordered[mid]
    return (ordered[mid - 1] + ordered[mid]) / 2


x = [0,1,2,3,4,5]
median(x)

2.5

Sorted List sort() and sorted()

1
2
3
x = [2,3,4,1,0]
x.sort()
x
[0, 1, 2, 3, 4]
1
2
3
x_sorted = x.sort()
x_sorted
type(x_sorted)
NoneType
1
2
x_sorted = sorted(x)
x_sorted
[0, 1, 2, 3, 4]

If you want to get integer

A list index must be an integer. x/2 always returns a float (e.g. 4/2 == 2.0), so use floor division x//2 when computing an index.

1
4//2
2
1
4//1.0
4.0
1
5//3
1
1
4/2
2.0

all() function

1
2
def is_prime(n):
    """Return True if ``n`` is prime: no ``k`` in ``2..n-1`` divides it.

    The ``n >= 2`` guard is needed because all() of an empty iterable is
    True, which would otherwise make 0 and 1 look prime.
    """
    # The divisibility test is n % k (does k divide n?), not k % 2.
    return n >= 2 and all(n % k != 0 for k in range(2, n))

all() returns True if every element of the iterable is truthy (and also for an empty iterable).

1
2
3
def mean(x: list[float]) -> float:
    """Return the arithmetic mean of the numbers in ``x``.

    Raises:
        ZeroDivisionError: if ``x`` is empty.
    """
    return sum(x) / len(x)

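Type hints are not enforced at runtime: passing the wrong type only fails if an operation inside the function fails (as sum() does on a string). Their purpose is to document the expected types and make the program easier to understand.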

Dict

  • Used to store key-value pairs
  • Fast lookup
  • "Generalized lists"
  • Keys must be hashable; values can be anything
1
2
my_dict = {"cat": ["kitten","Gaff"], "dog": "puppy", "chicken": "chick"}
my_dict["cat"]
['kitten', 'Gaff']
1
my_dict["cat"][0]
'kitten'
1
2
# iterators to keys, values, and key/value pairs
list(my_dict.keys())
['cat', 'dog', 'chicken']
1
2
# If no "list", the type is different
my_dict.keys()
dict_keys(['cat', 'dog', 'chicken'])
1
list(my_dict.values())
[['kitten', 'Gaff'], 'puppy', 'chick']
1
2
3
4
# return As Tuple
list(my_dict.items())
# A method for loops:
# for k,v in d.items():
[('cat', ['kitten', 'Gaff']), ('dog', 'puppy'), ('chicken', 'chick')]
1
2
# return a default value if missing
my_dict.get("penguin", None)
1
2
# return a default value if missing
my_dict.get("penguin", "not found")
'not found'
1
my_dict.get("cat", "not found")
['kitten', 'Gaff']
1
2
# add two dicts together; requires Python 3.9+
my_dict | {"horse": "foal"}
{'cat': ['kitten', 'Gaff'],
 'dog': 'puppy',
 'chicken': 'chick',
 'horse': 'foal'}

Exercise mode()

1
[0,1,1,2].count(1)
2
1
2
3
4
5
6
7
8
9
10
11
12
# Naive approach: list.count() scans the whole list and is called once per
# element, so the total cost is O(n^2).
def mode(x: list[float]) -> float:
    """Return the most frequent value in ``x``.

    On ties, the value that reaches the highest count first (in list
    order) wins. Returns None when ``x`` is empty.
    """
    highest_count = -1
    highest_value = None
    for y in x:
        count = x.count(y)  # count once per element instead of twice
        if count > highest_count:
            highest_count = count
            highest_value = y
    return highest_value

mode([0,1,1,2])

1
1
2
3
4
5
6
7
8
9
10
11
def mode(x: list[float]) -> dict[float, int]:
    """Count how often each value occurs in ``x``.

    This is the intermediate step toward computing the mode: it returns
    the ``{value: count}`` dict rather than the mode itself.
    """
    d = {}
    for y in x:
        # setdefault(y, 0) inserts y with count 0 only if y is not yet a key;
        # it is equivalent to: if y not in d: d[y] = 0
        d.setdefault(y, 0)
        d[y] += 1
    return d

mode([0,1,1,2])

{0: 1, 1: 2, 2: 1}

max()

1
max([0,1,23,3])
23

count() list

1
2
# count the object
my_list.count(1)
1

setdefault()

1
2
3
4
5
d={}
y=0
d.setdefault(y,0) # 如果是空集,就赋值一个默认值
d[y]=d[y]+1
d
{0: 1}

Collections

1
2
from collections import Counter
Counter([0,1,1,2])
Counter({0: 1, 1: 2, 2: 1})
1
Counter([0,1,1,2])[0]
1
1
2
# return the first n most common entries
Counter([0,1,1,2,2,2]).most_common(1)
[(2, 3)]
1
2
3
4
def mode(x: list[float]) -> float:
    """Return the most common value in ``x`` using collections.Counter.

    most_common(1) returns a one-element list of ``(value, count)``
    pairs, so ``[0][0]`` extracts the value.

    Raises:
        IndexError: if ``x`` is empty.
    """
    return Counter(x).most_common(1)[0][0]

mode([0,1,1,2,2,2])
2

Tuple

1
2
# Compare the first element
(0,1) > (1,2)
False
1
2
# Compare the first element
max((0,3),(1,2))
(1, 2)

Exercise:Harry Potter

1
2
hp1 = list(open("2/hp1.txt","rt"))
hp1[:5]
["Harry Potter and the Sorcerer's Stone\n", '\n', '\n', 'CHAPTER ONE\n', '\n']
1
len(hp1)
10702
1
hp1[0].split(" ")
['Harry', 'Potter', 'and', 'the', "Sorcerer's", 'Stone\n']
1
hp1[0].strip().split(" ")
['Harry', 'Potter', 'and', 'the', "Sorcerer's", 'Stone']
1
2
3
4
5
all_word = []
for line in hp1:
for word in line.strip().split(" "):
all_word.append(word)

1
2
3
4
all_word = [word 
for line in hp1
for word in line.strip().split(" ")
]
1
all_word[5:10]
['Stone', '', '', 'CHAPTER', 'ONE']
1
2
3
4
5
6
7
# 参见String部分
# "" 返回 False
all_word = [word
for line in hp1
for word in line.strip().split(" ")
if word
]
1
all_word[5:10]
['Stone', 'CHAPTER', 'ONE', 'THE', 'BOY']
1
c = Counter(all_word)
1
c.most_common(5)
[('the', 3306), ('to', 1827), ('and', 1787), ('a', 1577), ('of', 1235)]
1
stopwords = [line.strip() for line in open("2/stopwords.txt","rt")]
1
uncommon_words = [word for word in all_word if word not in stopwords]
1
Counter(uncommon_words).most_common(5)
[('Harry', 903), ('--', 688), ('said', 659), ('I', 499), ('He', 468)]
1
caps = [word for word in all_word if word[0].upper()==word[0] and len(word)>5]
1
2
Counter(caps).most_common(5)

[('Hagrid', 196),
 ('Hermione', 175),
 ('Professor', 161),
 ('Harry,', 134),
 ('Harry.', 115)]

Iterators

Iterable is python 的精髓

Iterator:

  • Call next()
  • If there are no more values, next() raises StopIteration (是exception的一种)
  • 可以infinite,但是list不可以
1
2
3
4
5
6
7
8
9
10
11
12
13
"""
a = [1,2]
next(a)
next(a)
next(a)
>>> List is not an iterator
"""
# Iterate something
a = [1,2]
a = iter(a)
next(a)
next(a)
next(a)
---------------------------------------------------------------------------

StopIteration                             Traceback (most recent call last)

c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 134' in <module>
     <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000133?line=10'>11</a> next(a)
     <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000133?line=11'>12</a> next(a)
---> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000133?line=12'>13</a> next(a)


StopIteration: 
1
2
3
4
5
"""
len(a)
>>> TypeError: object of type 'list_iterator' has no len()
"""
len(a)
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 135' in <module>
      <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000134?line=0'>1</a> """
      <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000134?line=1'>2</a> len(a)
      <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000134?line=2'>3</a> >>> TypeError: object of type 'list_iterator' has no len()
      <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000134?line=3'>4</a> """
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000134?line=4'>5</a> len(a)


TypeError: object of type 'list_iterator' has no len()

Iterators and generators can represent a large dataset conceptually, producing values one at a time instead of holding everything in memory.

Iterable is something I can call the function iter() on. Eg: list is an iterable.

All containers are iterable, e.g. dicts (iterating a dict yields its keys).

1
2
3
4
5
d = {1:"a",2:"b"}
e = iter(d)
# return the key of dict
next(e)

1
1
2
3
4
5
6
# What a for loop does under the hood: get an iterator from the iterable,
# then call next() until StopIteration is raised.
it = iter(iterable)
while True:
    try:
        a = next(it)
    except StopIteration:
        # The iterator is exhausted, so leave the loop.
        break

Itertools

  • Group By
1
2
3
4
5
6
7
8
9
10
11
12
13
14
import itertools


a_list = [("Animal", "cat"),
("Animal", "dog"),
("Bird", "peacock"),
("Bird", "pigeon")]


an_iterator = itertools.groupby(a_list, lambda x : x[0])

for key, group in an_iterator:
key_and_group = {key : list(group)}
print(key_and_group)
{'Animal': [('Animal', 'cat'), ('Animal', 'dog')]}
{'Bird': [('Bird', 'peacock'), ('Bird', 'pigeon')]}

it.product 将两个list一一对应形成一系列tuple

1
2
3
4
5
6
7
import itertools as it
suits = "♦♣♥♠"
ranks = [str(x) for x in range(2, 11)] + list("JQKA")


cards = it.product(ranks, suits)
next(cards)
('2', '♦')

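An it.product iterator cannot be shuffled directly (random.shuffle needs a sequence with len()); convert it to a list first.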

Error: object of type "itertools.product" has no len()

1
2
import random
random.shuffle(cards)
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 143' in <module>
      <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000142?line=0'>1</a> import random
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000142?line=1'>2</a> random.shuffle(cards)


File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0\lib\random.py:391, in Random.shuffle(self, x, random)
    <a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=388'>389</a> if random is None:
    <a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=389'>390</a>     randbelow = self._randbelow
--> <a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=390'>391</a>     for i in reversed(range(1, len(x))):
    <a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=391'>392</a>         # pick an element in x[:i+1] with which to exchange x[i]
    <a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=392'>393</a>         j = randbelow(i + 1)
    <a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=393'>394</a>         x[i], x[j] = x[j], x[i]


TypeError: object of type 'itertools.product' has no len()
1
2
3
list(cards)
# list(cards)
# 第二次list(iterator)返回空集,因为iterator已经遍历完了
[('2', '♣'),
 ('2', '♥'),
 ('2', '♠'),
 ('3', '♦'),
 ('3', '♣'),
 ('3', '♥'),
 ('3', '♠'),
 ('4', '♦'),
 ('4', '♣'),
 ('4', '♥'),
 ('4', '♠'),
 ('5', '♦'),
 ('5', '♣'),
 ('5', '♥'),
 ('5', '♠'),
 ('6', '♦'),
 ('6', '♣'),
 ('6', '♥'),
 ('6', '♠'),
 ('7', '♦'),
 ('7', '♣'),
 ('7', '♥'),
 ('7', '♠'),
 ('8', '♦'),
 ('8', '♣'),
 ('8', '♥'),
 ('8', '♠'),
 ('9', '♦'),
 ('9', '♣'),
 ('9', '♥'),
 ('9', '♠'),
 ('10', '♦'),
 ('10', '♣'),
 ('10', '♥'),
 ('10', '♠'),
 ('J', '♦'),
 ('J', '♣'),
 ('J', '♥'),
 ('J', '♠'),
 ('Q', '♦'),
 ('Q', '♣'),
 ('Q', '♥'),
 ('Q', '♠'),
 ('K', '♦'),
 ('K', '♣'),
 ('K', '♥'),
 ('K', '♠'),
 ('A', '♦'),
 ('A', '♣'),
 ('A', '♥'),
 ('A', '♠')]

随机list

  • Only List
1
2
3
4
import random
a = list(range(5))
random.shuffle(a)
a
[2, 1, 4, 3, 0]
1
list(cards)
[]
1
list(cards)
[]

Only can list once. 第二次为空,除非重新iter.product

iterable can be infinite

1
2
import itertools
natural = itertools.count()
1
2
3
# Never stop
next(natural)
# No `list(natural)` : because it is infinite
0

itertools.islice

  • An iterator cannot be sliced with [start:stop]; use itertools.islice instead
1
2
3
4
5
6
7
8
import itertools
natural = itertools.count()
# 0-37
naturalsTo38 = itertools.islice(natural,38)
list(naturalsTo38)
# 76, 因为natural是一个iterator,上一句在37结束,下一句从38开始做第一个,然后从38个以后也就是76开始
naturals38 = itertools.islice(natural,38,None)
next(naturals38)
76

Generators

  • is a type of iterator
  • yield: reserved word
    • After the last value has been yielded, the next call to next() raises StopIteration (it is raised, not returned)
    • 能够从上次结束的位置开始,无需重新循环一遍
    • 有点类似return,按顺序每次返回一个数,返回的type是generator

yield

1
2
3
def one():
    """Generator that yields the single value 1 and is then exhausted."""
    yield 1
one()
<generator object one at 0x000001CE21C93D80>
1
2
my_first_iter = one()

1
2
next(my_first_iter)

1
1
2
3
4
def ten():
    """Generator that yields the squares of 0 through 9, one at a time."""
    for i in range(10):
        yield i**2
my_2nd_iter = ten()
1
next(my_2nd_iter)
0
1
2
3
4
5
6
7
8
9
10
def gen():
    """Generator that yields 1..5 and then 0..9.

    Shows that a generator body may contain several yield statements and
    loops; execution resumes after the most recent yield on each next().
    """
    yield 1
    yield 2
    yield 3
    yield 4
    yield 5
    for i in range(10):
        yield i
g = gen()
list(g)
[1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

Generate the perfect numbers

1
2
3
4
5
6
7
8
9
10
11
12
13
14
def divisors(n):
    """Return the proper divisors of ``n`` (every i in 1..n-1 that divides n)."""
    return [i for i in range(1, n) if n % i == 0]


def is_perfect(i):
    """Return True if ``i`` is a perfect number.

    A perfect number is a positive integer equal to the sum of its proper
    divisors (6 = 1 + 2 + 3). The ``i > 0`` guard excludes 0, whose empty
    divisor list also sums to 0.
    """
    return i > 0 and sum(divisors(i)) == i


def perfect_numbers():
    """Infinite generator of the perfect numbers in increasing order."""
    i = 1  # perfect numbers are positive, so start the search at 1
    while True:
        if is_perfect(i):
            yield i
        i += 1


perfects = perfect_numbers()
# Don't call next(perfect_numbers()) directly: that creates a fresh generator
# each time, so it would always return the first value.
1
next(perfects)
6

Exceptions

  • exception is object
  • https://docs.python.org/3/library/exceptions.html
  • 常用:ValueError, TypeError, RuntimeError
1
2
3
4
if can_do_thing():
do_thing()
else:
handle_error()
1
2
3
4
try:
do_thing()
except NoCanDo:
handle_error()

Never:

1
2
3
4
try:
do_thing()
except:
handle_error()

Never:

1
2
3
4
try:
do_thing()
except:
pass
1
2
3
4
5
d = {1:2}
if 1 in d:
print(d[1])
else:
print("1 not in")
2
1
2
3
4
try:
print(d[1])
except KeyError:
print("key not in via excepttion")
2

Files

  • open(filename,mode) default: read only
    • reading,writing,or appending
    • text or binary(.dat)
1
2
for line in open("readme.txt"):
print(line)
---------------------------------------------------------------------------

FileNotFoundError                         Traceback (most recent call last)

c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 171' in <module>
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000170?line=0'>1</a> for line in open("readme.txt"):
      <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000170?line=1'>2</a>     print(line)


FileNotFoundError: [Errno 2] No such file or directory: 'readme.txt'

Formatting strings

  • % (old)
  • string.format()
  • f_strings(newest)
1
2
3
4
5
6
7
8
9
10
11
12
topping = "tomato"
"my fav pizza is %s" % topping
"my fav pizza is {}".format(topping)
"my fav pizza is {topping}".format(topping=topping)
f"my fav pizza is {topping}"

## Exercise: Weather in Ann Arbor



```python
!head 3/a2temps.dat
'head' 不是内部或外部命令,也不是可运行的程序
或批处理文件。

*variate可以用作多个data一次性赋值list:packing

1
2
3
4
5
6
import itertools
with open('3/a2temps.dat') as f:
tbl1 = list(itertools.islice(f,2,2+3))
month,*data = tbl1[0].strip().split(",")
data[:5]

['206', '267', '261', '228', '139']
1
2
month,*data = map(int,tbl1[0].strip().split(","))
data[:5]
[206, 267, 261, 228, 139]
1
2
3
4
5
6
def parse_table(txt):
    """Parse comma-separated integer rows into a dict.

    Each line of ``txt`` is stripped, split on commas and converted to
    ints. The first field becomes the key (the month) and the remaining
    fields become the list stored under it.

    Args:
        txt: iterable of strings, one row per item.

    Returns:
        dict mapping the first int of each row to the list of the rest.

    Raises:
        ValueError: if a field is not a valid integer (e.g. a blank line).
    """
    table = {}
    for line in txt:
        # Starred unpacking: the first value goes to month, all others to data.
        month, *data = map(int, line.strip().split(","))
        table[month] = data
    return table

repr() is what python actually sees

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import itertools
a2temps = {}
with open('3/a2temps.dat') as f:
    lines = []
    current_year = 1891
    for line in f:
        line = line.strip()
        if len(line) == 4:
            # A 4-character line is treated as a year header: store the rows
            # collected so far under the previous year, then start a new table.
            a2temps[current_year] = parse_table(lines)
            lines = []
            current_year = int(line)
        elif len(line) > 4:
            lines.append(line)
    # No header follows the last year's rows, so they must be stored here;
    # otherwise the final table in the file is silently dropped.
    if lines:
        a2temps[current_year] = parse_table(lines)
# a2temps[year][month][day]
a2temps[1893][1][:5]

[-6, 0, -83, -94, -72]

Generator

  • expression like list, but create iterator
1
2
3
s = Squares() # infinite
# Error: [x**2 for x in s]
sggen = (x**2 for x in s)
1
2
3
4
5
6
7
8
def even():
    """Infinite generator of the even numbers 0, 2, 4, ..."""
    i = 0
    while True:
        yield i
        i += 2

e = even()
next(e),next(e),next(e)
(0, 2, 4)
1
2
gen = (e**2 for e in even())
gen
<generator object <genexpr> at 0x000001CEF3932CE0>
1
next(gen),next(gen),next(gen)
(0, 4, 16)
1
2
3
4
r = range(10)
sum(
(rr**2 for rr in r)
)
285

lambda expressions

  • define functions without def
1
2
f = lambda x: x**2
f(10)
100

any() and all()

  • the argument must be an iterable (not necessarily an iterator)
  • any() return True if and only if one or more is True:
  • 0,0.0,empty string, empty list, is False
  • all() return True if all is true
1
any((0,0.0,""))
False

Reading csv and txt

The file object returned by open() is iterable: iterating over it yields the file's lines one at a time.

1
2
3
4
f = open('4/example2.txt')
for line in f:
print(line.strip().split(","))

['1', '2', '3', '4']
['5', '6', '7', '8']
['9', '10', '11', '12']

Reading csv with a header

1
print(open("4/example3.txt").read())
col1,col2,col3,col4
1,2,3,4
5,6,7,8
9,10,11,12

1
2
3
4
5
6
# The with block closes the file even if parsing fails.
with open('4/example3.txt') as f:
    # The first line is the header; next() consumes it so the comprehension
    # below only sees the data rows.
    header = next(f).strip().split(",")
    data = [line.strip().split(",") for line in f]
header, data
(['col1', 'col2', 'col3', 'col4'],
 [['1', '2', '3', '4'], ['5', '6', '7', '8'], ['9', '10', '11', '12']])

using pandas to read and write csv

1
2
import pandas as pd
pd.read_csv("4/example3.txt")
col1 col2 col3 col4
0 1 2 3 4
1 5 6 7 8
2 9 10 11 12

Reading bytes from a file

  • file.read(b) will read b bytes from a file.
1
2
f = open('4/example3.txt','rb')
b = f.read(4)
1
b,type(b)
(b'col1', bytes)
1
2
3
# Error:TypeError: can't concat str to bytes
# print(b+"2345")

1
print(b.decode("ascii")+"2345")
col12345
1
type(b.decode("ascii"))
str
1
2
3
4
5
6
7
8
9
#character
print(b.decode("ascii"))
# integer, big endian encoding
print(int.from_bytes(b,'big'))
# integer, little endian encoding
print(int.from_bytes(b,'little'))
import struct
# to Float
print(struct.unpack('f',b))
col1
1668246577
829189987
(3.4405835958040143e-09,)

Object

  • everything in python is an object
  • object-oriented programming
  • class name in CamelCase
  • Object & instance
  • attribute
    • embedded objects: an object can have other objects as its attributes.
1
2
3
4
5
6
class Point:
    """Minimal empty class, used to demonstrate instantiation."""

    pass

# Instantiation: instance(object) p of the class Point
p = Point()
type(p)
__main__.Point

Constructor:

  • __开始的函数

  • __init__() is to initialize the object

  • self, refer to the object itself: DON'T FORGET!

1
2
3
4
5
6
7
8
# __init__ is called automatically when an instance is created (Point(1, 1));
# it initializes the new object, which is passed in as self.
class Point:
    """A 2-D point; the constructor prints its arguments for demonstration."""

    def __init__(self, x, y):
        print(self, x, y)
        self.x = x
        self.y = y
p = Point(1,1)
print(p.x,p.y)
<__main__.Point object at 0x000001CF0323E8F0> 1 1
1 1
1
2
3
4
5
6
7
8
9
10
11
import math
class Point:
    """A 2-D point with x and y coordinates."""

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def norm(self):
        """Return the Euclidean distance of the point from the origin."""
        # Attributes must be accessed through self; bare x / y are not defined here.
        return math.sqrt(self.x**2 + self.y**2)
p = Point(1,1)
p.norm()

1.4142135623730951

TicTacToe

  • Define Private Variable in Python: _variable. [Convention]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class TicTacToe:
    """State of a tic-tac-toe game on a 3x3 board.

    ``_board`` is a list of three rows of three cells; a cell is ``None``
    while empty and holds ``"X"`` or ``"O"`` once played.  ``_turn`` is the
    mark that the next call to :meth:`move` will place.  The leading
    underscores mark both attributes as private by convention.
    """

    def __init__(self):
        # ``self`` is required: without it ``TicTacToe()`` raises TypeError
        # because Python always passes the new instance as first argument.
        self._board = [
            [None, None, None],
            [None, None, None],
            [None, None, None],
        ]
        self._turn = "X"

    def move(self, i, j):
        """Place the current player's mark at ``_board[i][j]`` and switch turns.

        Raises:
            ValueError: if the square already holds a mark.
        """
        if self._board[i][j] is not None:
            raise ValueError("square is occupied")
        self._board[i][j] = self._turn
        # Alternate between the two players after every successful move.
        self._turn = "O" if self._turn == "X" else "X"

def winner(board):
    """Return the mark that fills a complete line of a 3x3 board, else ``None``.

    ``board`` is indexed as ``board[x][y]`` with ``x`` and ``y`` in 0..2;
    empty cells hold ``None``.  Rows, columns and both diagonals are checked.
    """
    size = 3
    # Step directions (a, b): along a row, anti-diagonal, down a column, diagonal.
    directions = [(0, 1), (1, -1), (1, 0), (1, 1)]
    for x in range(size):
        for y in range(size):
            for a, b in directions:
                line = [(x + k * a, y + k * b) for k in range(size)]
                # Bounds must be checked explicitly: a negative index does not
                # raise IndexError, it wraps to the other side of the row, which
                # would make e.g. (0,0),(1,2),(2,1) look like a winning line.
                if not all(0 <= r < size and 0 <= c < size for r, c in line):
                    continue
                marks = {board[r][c] for r, c in line}
                if len(marks) == 1:
                    (mark,) = marks
                    # Three empty squares in a line are not a win.
                    if mark is not None:
                        return mark
    return None

1
!tail 4/ttt.py
'tail' 不是内部或外部命令,也不是可运行的程序
或批处理文件。

Dunder methods:

  • __str__(), show the object as a string
    • use str() to call this function
    • Or, when print() is used, the object is shown in its string form

Example: class

  • How to specify a constructor in Python:
    • __init__()
    • dunder method

The splat operator: *args: any number of arguments

  • return a tuple
1
2
3
4
def splat(*args):
print(args)

splat(1,2,"hello")
(1, 2, 'hello')
1
2
3
def splat(a,b,*args):
print(a,b,args)
splat(1,2,3,4)
1 2 (3, 4)
1
2
3
def splat(a,b):
print(a,b)
splat(a=1,b=2)
1 2
1
2
3
4
def splat(a,b,*args):
print(a,b,args)
splat(a=1,b=2)
# Error: splat(a=1,b=2,3)
1 2 ()

传递值时的不同

1
2
splat(*[1,2,3,4]) # spat(1,2,3,4)
# not = splat([1,2,3,4])
1 2 (3, 4)

**kwargs

1
2
3
4
def splat(**kwargs):
for key,value in kwargs.items():
print("{0}={1}".format(key,value))
splat(name="jun")
name=jun
1
2
3
4
5
6
7
8
9
10
11
12
13
14
class Vector():
    """A vector of numbers, stored in ``_val`` as a tuple."""

    def __init__(self, *args):
        from numbers import Number

        # Reject the input as soon as one component is not a number.
        for component in args:
            if not isinstance(component, Number):
                raise ValueError("Vector requires numerical data")

        self._val = tuple(args)
v = Vector(1,2,3)
# Vector.__init__ = __init__
# Vector(1,2,3,[])

isinstance(arg,Number) check if it is a number

Vector.__init__ = __init__ 函数可以=,让两个函数相等,这里直接给class中添加后面的函数

1
2
3
4
5
def dim(self):
return len(self._val)
Vector.dim = dim
v = Vector(1,2,3)
v.dim()
3

Decorator: @property

  • 被标记的方法是属性
  • 无法随意修改属性内容
  • 调用时函数名后面可以不用加括号
1
2
3
4
5
6
7
@property
def dim(self):
return len(self._val)
Vector.dim = dim
v = Vector(1,2,3)
v.dim
# Error: v.dim()
3

Dunder Method

1
2
3
4
5
6
def __len__(self):
return len(self._val)
Vector.__len__ = __len__
v = Vector(1,2,3)
# Error: v.len()
len(v)
3

__getitem__

1
2
3
4
5
def __getitem__(self,i):
return self._val[i]

Vector.__getitem__ = __getitem__
Vector(1,2,3,4)[0]
1

Iteration: iter

1
2
3
4
5
6
7
8
def __iter__(self):
# 等价于:return iter(self._val)
for i in range(len(self)):
yield self[i]

Vector.__iter__ = __iter__
for v in Vector(1,2,3):
print(v)
1
2
3

stopiteration的条件可以在__next__里面定义,同时更新self._val:return self._val

__iter__可以直接return self

1
2
3
4
5
6
7
def __str__(self):
s = ", ".join(map(str,self))
return "(" + s + ")"

Vector.__str__ = __str__
v = Vector(1,2,3)
print(v)
(1, 2, 3)
1
2
3
4
def __repr__(self):
return "Vector"+str(self)
Vector.__repr__ = __repr__

x+y in python:

  • try x.__add__(y)
  • if fails, Python tries y.__radd__(x)
  • if fails, Python raises TypeError
1
2
3
4
5
6
7
8
9
10
11
12
def __add__(self, other):
if isinstance(other, Vector):
if self.dim == other.dim:
return Vector(*[x + y for x, y in zip(self, other)])
else:
raise ValueError("not the same dim")
else:
return NotImplemented


Vector.__add__ = __add__
Vector(1, 2, 3) + Vector(4, 5, 6)
Vector(5, 7, 9)
1
2
v = Vector(1,2,3)
next(v)
<generator object __next__ at 0x000001AE30025FC0>
1
list(zip(Vector(1,2,3),Vector(4,5,6,7,8,9)))
[(1, 4), (2, 5), (3, 6)]

Notes:

  • We used a new operator *[x + y for x, y in zip(self, other)]. This is called the "splat" operator. It takes all the items in the list (or iterable more generally) and passes them in as function arguments.
  • NotImplemented is a special Python value signifying that the operation is not implemented. Here we do not allow anything to be added to a vector except another vector.
1
2
3
4
5
6
7
8
9
10
11
12
13
def __mul__(self, other):
from numbers import Number

if isinstance(other, Number):
return Vector(*[other * x for x in self])
else:
return NotImplemented


Vector.__mul__ = __mul__
Vector(1, 2, 3) * 2
Vector.__rmul__ = __mul__
2*Vector(1,2,3)
Vector(2, 4, 6)
1
2
3
Vector.__neg__ = lambda self: -1*self
Vector.__sub__ = lambda self,other:self+(-other)
Vector(1,2,3)-Vector(1,2,3)
Vector(0, 0, 0)
1
2
3
4
5
6
def dot(self, other):
return sum(a * b for a, b in zip(self, other))


Vector.dot = dot
Vector(1, 2, 3).dot(Vector(0, 1, 2))
8

@classmethod

  • A class method is a method which is bound to the class and not the object of the class.
  • 不需要预先生成一个object
1
2
3
4
5
6
7
8
9
@classmethod
def ones(cls, d):
"return the d-dimensional vector of ones"
ones = [1] * d
return cls(*ones)


Vector.ones = ones
Vector.ones(3)
Vector(1, 1, 1)

Note: 普通方法的话还需要先创建一个instance

1
2
3
4
5
6
7
8
9
10
11
12
13
Vector.mean = lambda self: sum(self) / len(self)


def var(self):
e = Vector.ones(len(self))
s = self - self.mean() * e
return s.dot(s) / (len(self) - 1)


Vector.var = var
import math

Vector.std = lambda self: math.sqrt(self.var())
1
2
3
4
import random

X = Vector(*[random.normalvariate(mu=1, sigma=2) for _ in range(100000)])
X.mean(), X.var()
(0.9980224941670806, 4.005165548865144)

Dataframe class

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
class Vector:
    """A vector of numbers with element-wise addition and scalar multiplication.

    The components are kept in the tuple ``_val``.  Two vectors are
    "conformable" when both are ``Vector`` instances of the same dimension.
    """

    def __init__(self, *args):
        import numbers

        self._val = tuple(args)
        # Iterating ``self`` walks the tuple that was just stored.
        for component in self:
            if not isinstance(component, numbers.Number):
                raise ValueError("Input should be a sequence of numbers")

    # --- container protocol -------------------------------------------

    @property
    def dim(self):
        """Number of components."""
        return len(self._val)

    def __len__(self):
        return len(self._val)

    def __iter__(self):
        return iter(self._val)

    def __getitem__(self, i):
        return self._val[i]

    def __str__(self):
        body = ", ".join(map(str, self._val))
        return "(" + body + ")"

    # --- helpers --------------------------------------------------------

    def _conformable(self, other):
        """Return True when ``other`` is a Vector of the same dimension."""
        if not isinstance(other, Vector):
            return False
        return self.dim == other.dim

    # --- comparison and arithmetic ---------------------------------------

    def __eq__(self, other):
        # Anything that is not a conformable Vector compares unequal.
        if not self._conformable(other):
            return False
        for a, b in zip(self, other):
            if not a == b:
                return False
        return True

    def __add__(self, other):
        # NotImplemented lets Python try the reflected operation or raise TypeError.
        if self._conformable(other):
            return Vector(*(a + b for a, b in zip(self, other)))
        return NotImplemented

    def __mul__(self, other):
        import numbers

        # Only multiplication by a scalar is supported.
        if isinstance(other, numbers.Number):
            return Vector(*(a * other for a in self._val))
        return NotImplemented

    def __rmul__(self, other):
        # ``scalar * vector`` is handled exactly like ``vector * scalar``.
        return self.__mul__(other)

    def dot(self, other):
        """Return the dot product with a conformable vector.

        Raises:
            ValueError: if ``other`` is not a conformable Vector.
        """
        if self._conformable(other):
            return sum(a * b for a, b in zip(self, other))
        raise ValueError("cannot dot product with a non-conformable vector")

1
2
3
4
5
my_iris = {
'sepal_length': Vector(5.1, 4.9, 4.7, 4.6, 5.0),
'sepal_width': Vector(3.5, 3.0, 3.2, 3.1, 3.6),
# ...
}
  • positional argument: don't have a name
  • keyword argument: has a name
  • 所以带变量名字的argument不能传递给positional argument
  • double splat: new operator
    • **kwargs: generate a dictionary with keywords
    • kwargs 不是唯一的,可以命名为别的
1
2
3
4
5
6
7
8
9
def double_splat(**kwargs):
print(kwargs)
# print("I got passed these args:\n" + str(args))


double_splat(kw1=1, kw2=2)
# 等价于
kwargs = {'kw1': 1, 'kw2': 2}
double_splat(**kwargs)
{'kw1': 1, 'kw2': 2}
{'kw1': 1, 'kw2': 2}

If I want to support variable numbers of both keyword and positional arguments, I can combine both forms of splat:

1
2
3
4
5
def both_splat(*args, **kwargs):
print("positional args", args)
print("kw args", kwargs)

both_splat(1, 2, kw=3)
positional args (1, 2)
kw args {'kw': 3}

Python has some restrictions on the use of splat in functions. The keyword arguments always come last:

1
2
def both_splat(**kwargs, *args):
pass
  Input In [177]
    def both_splat(**kwargs, *args):
                             ^
SyntaxError: invalid syntax

What if I want a mix of mandatory and optional arguments?

1
2
3
4
def some_optional(a, b, *args, **kwargs):
print(a, b, args, kwargs)

some_optional(1, b=2, c=3)
1 2 () {'c': 3}
1
2
3
4
class DataFrame:

def __init__(self, **kwargs):
self._data = kwargs
1
2
df = DataFrame(col=Vector(1,2))
df
<__main__.DataFrame at 0x1cf037f11e0>

Keys in dictionary:

  • hashable
  • unique
  • keys is a set
    • set is unique
1
{1,1,2,3} # set
{1, 2, 3}
1
{x+1 for x in range(10)}
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
1
2
3
a = {1,2}
b = {2,3}
a&b,a|b
({2}, {1, 2, 3})
1
2
a = frozenset([1,2,3])
b = frozenset([4,5,6])
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 268' in <module>
      <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000343?line=0'>1</a> a = frozenset([1,2,3])
      <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000343?line=1'>2</a> b = frozenset([4,5,6])
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000343?line=2'>3</a> a.add(b)


AttributeError: 'frozenset' object has no attribute 'add'
1
set([1,2,3,1])
{1, 2, 3}
  • list 不能作为key
    • unhashable
    • can be changed (mutable)
  • set 中不能有list
1
set([1,1,2,3,"a","b",[1,2,3]])
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 269' in <module>
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000268?line=0'>1</a> set([1,1,2,3,"a","b",[1,2,3]])


TypeError: unhashable type: 'list'
1
2
3
4
5
6
7
8
9
def __init__(self, **kwargs):
if not all(isinstance(col, Vector) for col in kwargs.values()):
raise TypeError("DataFrame columns are Vectors")
dims = {len(v) for v in kwargs.values()} # set comprehension
# set 里面的值是unique的
if len(dims) > 1:
raise ValueError("All columns must have the same dimension")

DataFrame.__init__ = __init__
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 282' in <module>
      <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000272?line=5'>6</a>     if len(dims) > 1:
      <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000272?line=6'>7</a>         raise ValueError("All columns must have the same dimension")
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000272?line=8'>9</a> DataFrame.__init__ = __init__


NameError: name 'DataFrame' is not defined

Inheritance

  • class ChildClass(ParentClass):
  • 支持Multiple Inheritance
  • 继承父类中的所有方法
1
2
class DataFrame(dict):
'a data set consisting of labeled vectors'
1
2
v1 = Vector(1, 2); v2 = Vector(3, 4); df = DataFrame(col1=v1, col2=v2)
df['col1'], len(df), list(df)
(<__main__.Vector at 0x1cf037f0e50>, 2, ['col1', 'col2'])
1
isinstance(df, dict)
True

The validation code we wrote above will not automatically apply. We need to override the default __init__ method of DataFrame with our own:

1
2
DataFrame.__init__ = __init__
DataFrame(col1=v1, col2=v2)
{}

Super constructor

  • 父类中也有一个__init__
  • 如果不加super constructor 就没有initialize the thing that hold the object
  • 只需要call it in init function
1
2
3
4
5
6
7
8
9
10
def __init__(self, **kwargs):
if not all(isinstance(col, Vector) for col in kwargs.values()):
raise TypeError("DataFrame columns are Vectors")
dims = {len(v) for v in kwargs.values()} # set comprehension
if len(dims) > 1:
raise ValueError("All columns must have the same dimension")
dict.__init__(self, **kwargs) # call superconstructor

DataFrame.__init__ = __init__
DataFrame(col1=v1, col2=v2)
{'col1': <__main__.Vector at 0x1cf037f0e50>,
 'col2': <__main__.Vector at 0x1cf037f3670>}

This __init__ function is okay, but has the deficiency that it doesn't allow us to create DataFrames in other, more flexible ways. For example, we can create a dict as follows:

1
2
kv = [(1, Vector(1)), (2, Vector(2))]  # key-value pairs
dict(kv)
{1: <__main__.Vector at 0x1cf037f22f0>, 2: <__main__.Vector at 0x1cf037f0640>}
1
2
3
# Not work for our Data Frame
DataFrame(kv)

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 281' in <module>
      <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000283?line=0'>1</a> # Not work for our Data Frame
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000283?line=1'>2</a> DataFrame(kv)


TypeError: __init__() takes 1 positional argument but 2 were given
  • More General way:__init__(self, *args, **kwargs)
1
2
3
4
5
6
7
8
9
10
11
def __init__(self, *args, **kwargs):
dict.__init__(self, *args, **kwargs) # call superconstructor
if not all(isinstance(col, Vector) for col in self.values()):
raise TypeError("DataFrame columns are Vectors")
dims = {len(v) for v in self.values()} # set comprehension
if len(dims) > 1:
raise ValueError("All columns must have the same dimension")


DataFrame.__init__ = __init__
DataFrame(col1=v1, col2=v2)
{'col1': <__main__.Vector at 0x1cf037f0e50>,
 'col2': <__main__.Vector at 0x1cf037f3670>}
1
2
# Now we can do
DataFrame(kv)
{1: <__main__.Vector at 0x1cf037f22f0>, 2: <__main__.Vector at 0x1cf037f0640>}
  • Python 中不常用继承:
    • 因为继承会继承父类所有的method, 有时候合并数据之类的会比较奇怪
1
DataFrame(col1=v1, col2=v2) | {'a': 1, 'b': 2}
{'col1': <__main__.Vector at 0x1cf037f0e50>,
 'col2': <__main__.Vector at 0x1cf037f3670>,
 'a': 1,
 'b': 2}
1
2
3
d = {1:2}
d.update(hello=3)
d
{1: 2, 'hello': 3}
1
2
3
4
df = DataFrame(col1=v1, col2=v2)
df.update(col2='uh oh')
print(type(df))
df
<class '__main__.DataFrame'>





{'col1': <__main__.Vector at 0x1cf037f0e50>, 'col2': 'uh oh'}

Override

  • override an inherited method while still calling the original (parent) implementation
1
2
3
4
5
6
7
8
9
def __getitem__(self, x):
if isinstance(x,tuple):
# unpack a tuple!!!
k,i=x
return dict.__getitem__(self,k)[i]
return dict.__getitem__(self, x)

DataFrame.__getitem__ = __getitem__
DataFrame(col1=v2, col2=v2)['col1', 0]
3

Iterating over observations

When analyzing data, it will be helpful to iterate over each observation. Something like:

1
2
3
4
5
6
>>> df = DataFrame(col1=Vector(1, 2), col2=Vector(3, 4))
>>> for x in df.obs():
>>> print(x)
>>> ...
{'col1': 1, 'col2': 3}
{'col1': 2, 'col2': 4}

To accomplish this, we'll want to zip together the vectors col1 and col2:

1
list(zip(Vector(1, 2), Vector(3, 4)))
[(1, 3), (2, 4)]

We will use this behavior to construct a second kind of iterator for our class:

1
2
3
4
def obs(self):
'Yields each observation as column-value dict'
for vals in zip(self.values()):
yield dict(zip(self, vals))

Let's try it out:

1
2
3
DataFrame.obs = obs
for ob in DataFrame(col1=v1, col2=v2, col3=v1).obs():
print(ob)
{'col1': <__main__.Vector object at 0x000001CF037F0E50>}
{'col1': <__main__.Vector object at 0x000001CF037F3670>}
{'col1': <__main__.Vector object at 0x000001CF037F0E50>}

因为问题出现在obs()中zip函数里,里面传的两个object组成的list,应该要加一个*,这样可以传递vector内部的值

1
2
3
4
5
6
7
8
9
def obs(self):
'Yields each observation as column-value dict'
for vals in zip(*self.values()):
print(list(self))
# 这里self返回的是key值
yield dict(zip(self, vals))
DataFrame.obs = obs
for ob in DataFrame(col1=v1, col2=v2, col3=v1).obs():
print(ob)
['col1', 'col2', 'col3']
{'col1': 1, 'col2': 3, 'col3': 1}
['col1', 'col2', 'col3']
{'col1': 2, 'col2': 4, 'col3': 2}

Note: 字典的list是key的list

1
2
3
a = {"a":[1,2,3],"b":[2,3,4]}
for x in a:
print(x)
a
b

Note: zip() 可以形成一个一一对应的tuple Python中很重要的概念就是iteration,要好好理解,有很多不同的用法

1
2
3
it = zip(*df.values())
vals = next(it)
list(zip(df,vals))
[('col1', 1), ('col2', 'u')]

Note: **args也可以理解为unpack every thing

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# 0 1 2 3 4 5 ...
# 1 2 3 4 5 6 ...
# 2 3 4 5 6 7 ...

import itertools
def windowed(iterable, k):
    """Return an iterator over the sliding windows of width ``k``.

    (x[1], x[2], ...) -> (x[1], ..., x[k]), (x[2], ..., x[k+1]), ...

    Each window is a tuple; windows that cannot be filled completely are
    not produced.
    """
    # k independent iterators over the same underlying items
    copies = itertools.tee(iterable, k)

    # Skip the first ``offset`` items of the offset-th copy so that the
    # copies are staggered by one position each.
    staggered = []
    for offset, copy in enumerate(copies):
        staggered.append(itertools.islice(copy, offset, None))
    # zip stops as soon as the most advanced copy is exhausted.
    return zip(*staggered)

list(windowed(range(10), 3))
[(0, 1, 2),
 (1, 2, 3),
 (2, 3, 4),
 (3, 4, 5),
 (4, 5, 6),
 (5, 6, 7),
 (6, 7, 8),
 (7, 8, 9)]

map, filter, and reduce

  • map applies a function to every element, yield a new sequence(iterator)
  • filter removes the elements for which a Boolean function returns False
1
list(map(lambda x: x**2+1,range(10)))
[1, 2, 5, 10, 17, 26, 37, 50, 65, 82]

map

  • 2 positional args
    • number of iterables must match arguments in function
1
2
3
def poly(x,y):
return (x*y)
list(map(poly,range(10),range(10,20)))
[0, 11, 24, 39, 56, 75, 96, 119, 144, 171]
1
list(map(lambda x,y: x*y,range(10),range(10,20)))
[0, 11, 24, 39, 56, 75, 96, 119, 144, 171]

filter

  • takes a Boolean function
  • returns only the elements for which the function is True
  • 2nd argument can be any iterator
    • generator
    • iterator
    • ...
1
2
3
4
def is_even(x):
return(x%2==0)
# return x%2==0
list(filter(is_even,range(10)))
[0, 2, 4, 6, 8]
1
list(filter(is_even,(x**2 for x in range(10)))) # Filtering a generator
[0, 4, 16, 36, 64]

Reduce

  • reduce an iterator to a single element
  • functools contains a bunch of useful functional programming functions, including reduce
  • functools.reduce
  • Works by initializing an accumulator and repeatedly updating it
  • If the initial value is not supplied, Python initializes the accumulator as acc=f(x,y), where x and y are the first two elements of the iterator.
  • If the iterator has length 1, it will return that element.
  • Best to specify the initial value
1
2
3
4
import functools
print(functools.reduce(lambda x,y: y/x, range(1,4),1)) # 1 is initial value
print(functools.reduce(lambda x,y: x/y, range(1,4),1)) # 1 is initial value

1.5
0.16666666666666666
1
sum([[1,2,3],[4,5],[7,8,9]],[])
[1, 2, 3, 4, 5, 7, 8, 9]

理解reduce的运算规则

1
2
3
4
5
6
7
from functools import reduce
elements = range(100)
print(reduce(lambda accum,x:accum+1,elements))
elements = reversed(range(100))
print(reduce(lambda accum,x:accum+1,elements))
elements = reversed(range(100))
print(reduce(lambda accum,x:accum+1,elements,0))
99
198
100

map can be run in parallel

1
2
3
4
5
6
7
8
hp1 = open("2/hp1.txt","rt")
from collections import Counter
def mapper(line):
    """Count the words on one line of text.

    The line is stripped of surrounding whitespace and split on single
    spaces; the result maps each piece to its number of occurrences.
    """
    word_counts = Counter()
    word_counts.update(line.strip().split(" "))
    return word_counts

result = reduce(lambda x,y: x+y,map(mapper,hp1)) # This method is really slow

1
2
3
c1 = Counter(["hello",'world'])
c2 = Counter(['hello','Jun'])
c1,c2,c1+c2
(Counter({'hello': 1, 'world': 1}),
 Counter({'hello': 1, 'Jun': 1}),
 Counter({'hello': 2, 'world': 1, 'Jun': 1}))

Enron e-mail data

1
!gzcat email-Enron.txt.gz | head
'gzcat' 不是内部或外部命令,也不是可运行的程序
或批处理文件。

gzip 打开gz压缩文件

1
2
3
4
5
6
7
import gzip
import itertools

def enron(limit=None):
    """Yield the data lines of the gzipped Enron e-mail file.

    Lines starting with ``#`` are skipped and every yielded line is
    stripped of surrounding whitespace.  At most ``limit`` lines are
    produced; ``None`` means no limit.
    """
    with gzip.open("7/email-Enron.txt.gz", "rt") as handle:
        # Drop comment lines first, then cap the number of remaining lines.
        data_lines = filter(lambda raw: not raw.startswith("#"), handle)
        for raw in itertools.islice(data_lines, limit):
            yield raw.strip()

Who sends the most e-mail

1
2
3
4
5
6
7
8
9
10
def mapper(line):
    """Turn one tab-separated line into a Counter crediting its first field.

    The first field is converted to ``int`` and counted once.
    """
    first_field, _, _ = line.partition("\t")
    return Counter({int(first_field): 1})


def reducer(accumulated, x):
    """Combine the running result with the next mapped value using ``+``."""
    combined = accumulated + x
    return combined


senders = reduce(reducer, map(mapper, enron(10000)))
senders.most_common(10)
[(76, 815),
 (95, 597),
 (127, 481),
 (93, 455),
 (106, 422),
 (72, 377),
 (90, 366),
 (78, 339),
 (109, 339),
 (56, 309)]

set()

1
2
3
4
5
x = set()
x.add(1)
x.add(2)
x.add(1)
x
{1, 2}
1
2
3
4
x = set()
x.add(frozenset([1,2,3]))
x
# Error: x.add({1,2,3})
{frozenset({1, 2, 3})}

Who e-mailed the most unique people

  • set()
  • 复杂的reduce例子
1
2
3
4
5
6
7
8
9
10
11
12
def mapper(line):
    """Split a tab-separated line and convert every field to ``int``."""
    fields = line.split("\t")
    return list(map(int, fields))


def reducer(accumulated, x):
    """Record the receiver of pair ``x`` in the sender's set of recipients.

    ``x`` unpacks to ``(sender, receiver)``.  ``accumulated`` maps each
    sender to a set of receivers; it is updated in place and returned.
    """
    sender, receiver = x
    # setdefault creates the sender's set on first sight and returns it either way.
    accumulated.setdefault(sender, set()).add(receiver)
    return accumulated


unique_recipients = reduce(reducer, map(mapper, enron(10000)), {})

This pattern emerges over and over in data analysis. So we'll stop repeating ourselves and make a function:

1
2
3
4
5
6
7
8
9
10
from functools import reduce


def map_reduce(iterable, mapper, reducer, reduce_init=None):
    """Map every item, fold the mapped values, and return the result as a list.

    ``mapper`` is applied to each item of ``iterable``; the mapped values are
    folded with ``reducer(accumulator, value)`` starting from ``reduce_init``;
    the folded value is finally passed to ``list``.
    """
    mapped = (mapper(item) for item in iterable)
    # NOTE(review): reduce_init is always passed as the initial value, so the
    # default None reaches the reducer as its first accumulator.
    folded = reduce(reducer, mapped, reduce_init)
    # NOTE(review): list() over a dict keeps only its keys - confirm intended.
    return list(folded)


email_score = map_reduce(enron(10000), mapper, reducer, {})
<IPython.core.display.Javascript object>
  • create default value for dict
1
2
3
from collections import defaultdict
init = defaultdict(lambda:0)
init
defaultdict(<function __main__.<lambda>()>, {})
1
2
init[5]+=1
init
defaultdict(<function __main__.<lambda>()>, {5: 1})
1
2
3
4
from collections import defaultdict
init = defaultdict(lambda:0)
email_score = reduce(reducer,map(mapper,enron(10)),init)
email_score
defaultdict(<function __main__.<lambda>()>,
            {0: {1}, 1: {0, 2, 3, 4, 5, 6, 7, 8, 9}})

This is an example of where I would consider writing a class:

  • 只需要改变mapper和reducer,用函数override的方式
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from functools import reduce


class MapReduce:
    """Base class for a map-reduce job over a stored iterable.

    Subclasses implement :meth:`mapper` and :meth:`reducer` and may set
    ``reduce_init`` to choose the initial accumulator.
    """

    # Initial accumulator handed to ``reduce``.  Defined on the base class so
    # that ``run`` does not raise AttributeError when a subclass omits it.
    reduce_init = None

    def __init__(self, iterable):
        self._iterable = iterable

    def mapper(self, x):
        """Transform one input item; must be overridden."""
        raise NotImplementedError()

    def reducer(self, accumulated, x):
        """Fold one mapped value into the accumulator; must be overridden."""
        raise NotImplementedError()

    def run(self):
        """Map every stored item and fold the results, starting from ``reduce_init``."""
        mapped = map(self.mapper, self._iterable)
        return reduce(self.reducer, mapped, self.reduce_init)

Here is how we would use this class:

1
2
3
4
5
6
7
8
9
10
11
12
13
class EmailScore(MapReduce):
    """Score each unordered pair of addresses by who e-mails whom.

    For every line, the pair is keyed by its two ids in sorted order; the
    score goes up by 1 when the smaller id is the sender and down by 1
    otherwise.
    """

    @property
    def reduce_init(self):
        # A fresh dict on every access.  The reducer mutates its accumulator,
        # so a class-level ``{}`` would be shared by all runs and later runs
        # would keep adding to the scores of earlier ones.
        return {}

    def mapper(self, line):
        """Parse a tab-separated line into a list of ints."""
        return [int(x) for x in line.strip().split("\t")]

    def reducer(self, accumulated, x):
        """Update the score of the pair ``x`` in place and return the dict."""
        # Unpacking also enforces that x has exactly two elements.
        sender, _ = x
        key = tuple(sorted(x))
        direction = 1 if key[0] == sender else -1
        accumulated.setdefault(key, 0)
        accumulated[key] += direction
        return accumulated
1
email_score = EmailScore(enron(10000)).run()

Mapreduce 通用模板

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from functools import reduce
import itertools
import gzip


class MapReduce:
    """Template for map-reduce jobs.

    Subclasses provide :meth:`mapper` and :meth:`reducer`, and may override
    :attr:`reduce_init` and :meth:`postprocess`.
    """

    @property
    def reduce_init(self):
        """Initial accumulator passed to ``reduce``; ``None`` unless overridden."""
        return None

    def mapper(self, x):
        """Transform one input item; must be overridden."""
        raise NotImplementedError()

    def reducer(self, accum, x):
        """Fold one mapped value into ``accum``; must be overridden."""
        raise NotImplementedError()

    def postprocess(self, reduced):
        """Final hook applied to the reduced value; the default returns it unchanged."""
        return reduced

    def run(self, iterable):
        """Map, reduce and post-process ``iterable``, returning the final value."""
        stream = map(self.mapper, iterable)
        folded = reduce(self.reducer, stream, self.reduce_init)
        return self.postprocess(folded)

Tricks

  • 用于有些index为负数的情况
1
-1%5
4
  • list中有if else的情况
    • and or
1
2
x=[1,2,3,4]
[i==2 and "1" or "0" for i in x]
['0', '1', '0', '0']
1
["1" if i==2 else "0"  for i in x]
['0', '1', '0', '0']
1