If you call list(range(10_000_000_000)), it will crash
your computer, because a list takes far longer to build: list() iterates
over its argument and stores every element in memory. For the same
reason, list() must be given an iterable argument. Check help(list).
Exercise: is_prime
1 2 3 4 5 6
def is_prime(n):
    """Return True if the integer n is prime, False otherwise.

    Trial division: n is prime when no k in 2..n-1 divides it.
    Numbers below 2 are not prime.
    """
    if n < 2:
        # Without this guard the loop below is empty for 0 and 1 and the
        # function would wrongly report them as prime.
        return False
    for k in range(2, n):
        if n % k == 0:
            return False
    return True


is_prime(4)
False
all() function
1 2 3
def is_prime(n):
    """Return True if the integer n is prime, False otherwise.

    all() over a generator stops at the first divisor found, so this is
    the trial-division loop written as a single expression.
    """
    # n >= 2 guards 0 and 1: all() of an empty generator is True.
    return n >= 2 and all(n % k != 0 for k in range(2, n))


is_prime(5)
True
1
len([n for n inrange(2,101) if is_prime(n)])
25
Exercise mean()
1 2 3 4 5 6
def mean(x: list[float]) -> float:
    """Return the mean of the list of numbers x.

    Raises ZeroDivisionError when x is empty, and TypeError when the
    elements cannot be summed (for example a string).
    """
    return sum(x) / len(x)


# Error: mean("this is not a float")
mean([1, 2, 3, 4])
2.5
1 2 3 4 5 6 7 8 9 10 11 12 13
def median(x: list[float]) -> float:
    """Return a median of the numbers in x.

    For an odd number of values this is the middle value of the sorted
    data; for an even number it is the average of the two middle values.
    The input list is not modified. An empty list raises IndexError.
    """
    x_sorted = sorted(x)
    n = len(x)
    if n % 2 == 1:
        # Floor division keeps the index an int (n / 2 would be a float).
        return x_sorted[n // 2]
    return (x_sorted[n // 2 - 1] + x_sorted[n // 2]) / 2


x = [0, 1, 2, 3, 4, 5]
median(x)
2.5
Sorted List sort() and
sorted()
1 2 3
x = [2,3,4,1,0] x.sort() x
[0, 1, 2, 3, 4]
1 2 3
x_sorted = x.sort() x_sorted type(x_sorted)
NoneType
1 2
x_sorted = sorted(x) x_sorted
[0, 1, 2, 3, 4]
If you want to get an integer
A list index must be an integer, so computing it with x/2 can go wrong:
/ always returns a float. Use floor division x//2 instead.
1
4//2
2
1
4//1.0
4.0
1
5//3
1
1
4/2
2.0
all() function
1 2
def is_prime(n):
    """Return True if the integer n is prime, False otherwise.

    The generator must test whether each candidate k divides n
    (n % k); testing k % 2 would only check the parity of k.
    """
    # n >= 2 guards 0 and 1: all() of an empty generator is True.
    return n >= 2 and all(n % k != 0 for k in range(2, n))
# This operation is expensive: list.count() is O(n) and it runs once per
# element, so the whole function is O(n^2).
def mode(x: list[float]) -> float:
    """Return the most frequent value in x.

    Ties are resolved in favour of the value that appears first in x.
    Returns None when x is empty.
    """
    highest_count = -1
    highest_value = None
    for y in x:
        count = x.count(y)  # count once per element instead of twice
        if count > highest_count:
            highest_count = count
            highest_value = y
    return highest_value


mode([0, 1, 1, 2])
1
1 2 3 4 5 6 7 8 9 10 11
def mode(x: list[float]) -> dict[float, int]:
    """Count how many times each value occurs in x.

    Intermediate step towards a faster mode: a single pass over x that
    builds and returns a dict mapping each value to its number of
    occurrences.
    """
    d = {}
    for y in x:
        # Same effect as: if y not in d: d[y] = 0
        d.setdefault(y, 0)  # if the key is missing, start its count at 0
        d[y] += 1
    return d


mode([0, 1, 1, 2])
{0: 1, 1: 2, 2: 1}
max()
1
max([0,1,23,3])
23
count() list
1 2
# count the object my_list.count(1)
1
setdefault()
1 2 3 4 5
d={} y=0 d.setdefault(y,0) # 如果是空集,就赋值一个默认值 d[y]=d[y]+1 d
{0: 1}
Collections 类
1 2
from collections import Counter Counter([0,1,1,2])
Counter({0: 1, 1: 2, 2: 1})
1
Counter([0,1,1,2])[0]
1
1 2
# return the first n most common entries Counter([0,1,1,2,2,2]).most_common(1)
""" len(a) >>> TypeError: object of type 'list_iterator' has no len() """ len(a)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 135' in <module>
<a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000134?line=0'>1</a> """
<a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000134?line=1'>2</a> len(a)
<a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000134?line=2'>3</a> >>> TypeError: object of type 'list_iterator' has no len()
<a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000134?line=3'>4</a> """
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000134?line=4'>5</a> len(a)
TypeError: object of type 'list_iterator' has no len()
Iterators and generators can represent a large dataset without running out of memory.
They conceptually represent the dataset and produce its items on demand.
An iterable is any object that the function
iter() can be called on. E.g. a list is an iterable.
All containers are iterable. E.g. dicts
1 2 3 4 5
d = {1:"a",2:"b"} e = iter(d) # return the key of dict next(e)
1
1 2 3 4 5 6
# What a for-loop does under the hood: ask the iterable for an iterator,
# then call next() until the iterator signals exhaustion.
it = iter(iterable)
while True:
    try:
        a = next(it)
    except StopIteration:
        # Raised by next() when there are no more items.
        break
Itertools
Group By
1 2 3 4 5 6 7 8 9 10 11 12 13 14
import itertools a_list = [("Animal", "cat"), ("Animal", "dog"), ("Bird", "peacock"), ("Bird", "pigeon")] an_iterator = itertools.groupby(a_list, lambda x : x[0]) for key, group in an_iterator: key_and_group = {key : list(group)} print(key_and_group)
import itertools as it suits = "♦♣♥♠" ranks = [str(x) for x inrange(2, 11)] + list("JQKA")
cards = it.product(ranks, suits) next(cards)
('2', '♦')
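An itertools.product object cannot be shuffled: random.shuffle needs a sequence with a length, so convert it with list(cards) first.
Error: object of type 'itertools.product' has no len()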
1 2
import random random.shuffle(cards)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 143' in <module>
<a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000142?line=0'>1</a> import random
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000142?line=1'>2</a> random.shuffle(cards)
File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0\lib\random.py:391, in Random.shuffle(self, x, random)
<a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=388'>389</a> if random is None:
<a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=389'>390</a> randbelow = self._randbelow
--> <a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=390'>391</a> for i in reversed(range(1, len(x))):
<a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=391'>392</a> # pick an element in x[:i+1] with which to exchange x[i]
<a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=392'>393</a> j = randbelow(i + 1)
<a href='file:///c%3A/Program%20Files/WindowsApps/PythonSoftwareFoundation.Python.3.10_3.10.752.0_x64__qbz5n2kfra8p0/lib/random.py?line=393'>394</a> x[i], x[j] = x[j], x[i]
TypeError: object of type 'itertools.product' has no len()
def ten():
    """Generator yielding the squares of 0 through 9, one per next() call."""
    for i in range(10):
        yield i**2


my_2nd_iter = ten()
1
next(my_2nd_iter)
0
1 2 3 4 5 6 7 8 9 10
def gen():
    """Generator yielding 1..5 from separate yield statements, then 0..9.

    Shows that a generator body may contain any mix of yield statements
    and loops; execution resumes after the last yield on each next().
    """
    yield 1
    yield 2
    yield 3
    yield 4
    yield 5
    for i in range(10):
        yield i


g = gen()
list(g)
[1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Generate the perfect numbers
1 2 3 4 5 6 7 8 9 10 11 12 13 14
def divisors(n):
    """Return the proper divisors of n: every i in 1..n-1 that divides n."""
    return [i for i in range(1, n) if n % i == 0]


def is_perfect(i):
    """Return True if i is a perfect number (equal to the sum of its proper divisors)."""
    # 0 has no proper divisors, so sum([]) == 0 would make it look perfect;
    # perfect numbers are positive by definition.
    return i > 0 and sum(divisors(i)) == i


def perfect_numbers():
    """Infinite generator yielding the perfect numbers in increasing order."""
    i = 1  # start at 1, not 0, so that 0 is never yielded
    while True:
        if is_perfect(i):
            yield i
        i += 1
if can_do_thing(): do_thing() else: handle_error()
1 2 3 4
try: do_thing() except NoCanDo: handle_error()
Never:
1 2 3 4
try: do_thing() except: handle_error()
Never:
1 2 3 4
try: do_thing() except: pass
1 2 3 4 5
d = {1:2} if1in d: print(d[1]) else: print("1 not in")
2
1 2 3 4
try: print(d[1]) except KeyError: print("key not in via excepttion")
2
Files
open(filename,mode) default: read only
reading,writing,or appending
text or binary(.dat)
1 2
for line inopen("readme.txt"): print(line)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 171' in <module>
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000170?line=0'>1</a> for line in open("readme.txt"):
<a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000170?line=1'>2</a> print(line)
FileNotFoundError: [Errno 2] No such file or directory: 'readme.txt'
Formatting strings
% (old)
string.format()
f_strings(newest)
1 2 3 4 5 6 7 8 9 10 11 12
topping = "tomato" "my fav pizza is %s" % topping "my fav pizza is {}".format(topping) "my fav pizza is {topping}".format(topping=topping) f"my fav pizza is {topping}"
class TicTacToe:
    """State of a tic-tac-toe game: a 3x3 board and whose turn it is."""

    def __init__(self):
        # None marks an empty square. Each row is its own list (a
        # comprehension, not [[None] * 3] * 3) so rows are not aliased.
        self._board = [[None] * 3 for _ in range(3)]
        self._turn = "X"

    def move(self, i, j):
        """Place the current player's mark at row i, column j.

        Raises ValueError if the square is already occupied. After a
        successful move the turn passes to the other player.
        """
        if self._board[i][j] is not None:
            raise ValueError("square is occupied")
        self._board[i][j] = self._turn
        self._turn = "O" if self._turn == "X" else "X"
def winner(board):
    """Return the mark that fills a complete line on a 3x3 board, else None.

    board is indexed as board[row][col]; empty squares hold None. A line
    is three squares starting at (x, y) and stepping by (a, b): along a
    row (0, 1), down a column (1, 0), or along a diagonal (1, 1) / (1, -1).
    """
    winning_combinations = [
        [(x, y), (x + a, y + b), (x + 2 * a, y + 2 * b)]
        for x in range(3)
        for y in range(3)
        for a in range(2)
        for b in range(-1, 2)
        if max(a, b) > 0
        # Keep only lines that stay on the board. Relying on IndexError is
        # not enough: a negative column index silently wraps around to the
        # other side of the row and produces bogus "lines".
        and 0 <= x + 2 * a < 3
        and 0 <= y + 2 * b < 3
    ]
    for w in winning_combinations:
        s = {board[x][y] for x, y in w}
        sq = s.pop()
        # After popping one value the set is empty only if all three
        # squares held the same value; that value must be a real mark.
        if len(s) == 0 and sq is not None:
            return sq
    return None
def __add__(self, other):
    """Return the component-wise sum of two vectors as a new Vector.

    Returns NotImplemented when other is not a Vector, so Python can try
    the reflected operation or raise TypeError itself. Raises ValueError
    when the two vectors have different dimensions.
    """
    if not isinstance(other, Vector):
        return NotImplemented
    if self.dim != other.dim:
        raise ValueError("not the same dim")
    # The * ("splat") unpacks the list into separate positional arguments.
    return Vector(*[x + y for x, y in zip(self, other)])
We used a new operator
*[x + y for x, y in zip(self, other)]. This is called the
"splat" operator. It takes all the items in the list (or iterable more
generally) and passes them in as function arguments.
NotImplemented is a special Python value signifying
that the operation is not implemented. Here we do not allow anything to
be added to a vector except another vector.
1 2 3 4 5 6 7 8 9 10 11 12 13
def __mul__(self, other):
    """Return this vector scaled by the number other, as a new Vector.

    Returns NotImplemented when other is not a numbers.Number.
    """
    from numbers import Number

    if isinstance(other, Number):
        return Vector(*[other * x for x in self])
    return NotImplemented
self._val = tuple(args) ifnotall(isinstance(x, numbers.Number) for x in self): raise ValueError("Input should be a sequence of numbers")
@property
def dim(self):
    """Number of components stored in this vector (read-only property)."""
    return len(self._val)
def __add__(self, other):
    """Return the component-wise sum of self and other as a new Vector.

    Returns NotImplemented when self._conformable(other) is false.
    """
    if not self._conformable(other):
        return NotImplemented
    return Vector(*[a + b for a, b in zip(self, other)])
def __mul__(self, other):
    """Return this vector scaled by the number other, as a new Vector.

    Returns NotImplemented when other is not a numbers.Number.
    """
    import numbers

    if not isinstance(other, numbers.Number):
        return NotImplemented
    return Vector(*[a * other for a in self._val])
def __len__(self):
    """Return the number of components, so len(vector) works."""
    return len(self._val)
def __eq__(self, other):
    """Return True when other is conformable and all components are equal.

    Anything that fails self._conformable(other) compares unequal.
    """
    if not self._conformable(other):
        return False
    return all(a == b for a, b in zip(self, other))
def dot(self, other):
    """Return the dot product of self and other.

    Raises ValueError when self._conformable(other) is false.
    """
    if not self._conformable(other):
        raise ValueError("cannot dot product with a non-conformable vector")
    return sum(a * b for a, b in zip(self, other))
def __init__(self, **kwargs):
    """Validate the columns passed as keyword arguments.

    Raises TypeError if any column is not a Vector, and ValueError if
    the columns do not all have the same length.
    """
    if not all(isinstance(col, Vector) for col in kwargs.values()):
        raise TypeError("DataFrame columns are Vectors")
    # Set comprehension: a set keeps only unique values, so more than one
    # entry means the columns disagree on length.
    dims = {len(v) for v in kwargs.values()}
    if len(dims) > 1:
        raise ValueError("All columns must have the same dimension")
DataFrame.__init__ = __init__
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 282' in <module>
<a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000272?line=5'>6</a> if len(dims) > 1:
<a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000272?line=6'>7</a> raise ValueError("All columns must have the same dimension")
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000272?line=8'>9</a> DataFrame.__init__ = __init__
NameError: name 'DataFrame' is not defined
Inheritance
class ChildClass(ParentClass):
支持Multiple Inheritance
继承父类中的所有方法
1 2
class DataFrame(dict):
    """A data set consisting of labeled vectors."""
If we do not call the superclass constructor, the underlying dict that
holds the object's data is never initialized.
We only need to call it inside the __init__ function.
1 2 3 4 5 6 7 8 9 10
def __init__(self, **kwargs):
    """Validate the keyword columns, then store them in the underlying dict.

    Raises TypeError if any column is not a Vector, and ValueError if
    the columns do not all have the same length.
    """
    if not all(isinstance(col, Vector) for col in kwargs.values()):
        raise TypeError("DataFrame columns are Vectors")
    dims = {len(v) for v in kwargs.values()}  # set comprehension
    if len(dims) > 1:
        raise ValueError("All columns must have the same dimension")
    # Call the superconstructor explicitly; without it the dict part of
    # the object is never filled with the columns.
    dict.__init__(self, **kwargs)
{'col1': <__main__.Vector at 0x1cf037f0e50>,
'col2': <__main__.Vector at 0x1cf037f3670>}
This __init__ function is okay, but has the deficiency
that it doesn't allow us to create DataFrames in other, more flexible
ways. For example, we can create a dict as follows:
{1: <__main__.Vector at 0x1cf037f22f0>, 2: <__main__.Vector at 0x1cf037f0640>}
1 2 3
# Not work for our Data Frame DataFrame(kv)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
c:\Users\Jun\Documents\Courses\Stats 507\CourseNote.ipynb Cell 281' in <module>
<a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000283?line=0'>1</a> # Not work for our Data Frame
----> <a href='vscode-notebook-cell:/c%3A/Users/Jun/Documents/Courses/Stats%20507/CourseNote.ipynb#ch0000283?line=1'>2</a> DataFrame(kv)
TypeError: __init__() takes 1 positional argument but 2 were given
More General way:__init__(self, *args, **kwargs)
1 2 3 4 5 6 7 8 9 10 11
def __init__(self, *args, **kwargs):
    """Build the frame from anything dict() accepts, then validate it.

    Forwarding *args and **kwargs to dict.__init__ allows construction
    from a mapping or an iterable of pairs as well as keyword arguments.
    Raises TypeError if any stored value is not a Vector, and ValueError
    if the columns do not all have the same length.
    """
    # Call the superconstructor first so the checks below can read the
    # columns from self, however they were passed in.
    dict.__init__(self, *args, **kwargs)
    if not all(isinstance(col, Vector) for col in self.values()):
        raise TypeError("DataFrame columns are Vectors")
    dims = {len(v) for v in self.values()}  # set comprehension
    if len(dims) > 1:
        raise ValueError("All columns must have the same dimension")
When analyzing data, it will be helpful to iterate over each
observation. Something like:
1 2 3 4 5 6
>>> df = DataFrame(col1=Vector(1, 2), col2=Vector(3, 4)) >>> for x in df.obs(): >>> print(x) >>> ... {'col1': 1, 'col2': 3} {'col1': 2, 'col2': 4}
To accomplish this, we'll want to zip together the vectors col1 and
col2:
1
list(zip(Vector(1, 2), Vector(3, 4)))
[(1, 3), (2, 4)]
We will use this behavior to construct a second kind of iterator for
our class:
1 2 3 4
def obs(self):
    """Yields each observation as column-value dict."""
    # NOTE: first attempt, kept as written in the notes. zip() receives the
    # values view as ONE argument, so each `vals` is a 1-tuple holding a
    # whole column rather than one row. The corrected version later in
    # these notes unpacks the columns with zip(*self.values()).
    for vals in zip(self.values()):
        yield dict(zip(self, vals))
Let's try it out:
1 2 3
DataFrame.obs = obs for ob in DataFrame(col1=v1, col2=v2, col3=v1).obs(): print(ob)
{'col1': <__main__.Vector object at 0x000001CF037F0E50>}
{'col1': <__main__.Vector object at 0x000001CF037F3670>}
{'col1': <__main__.Vector object at 0x000001CF037F0E50>}
def obs(self):
    """Yields each observation as column-value dict."""
    # The * unpacks the columns into separate arguments, so zip() walks
    # them in parallel and each `vals` holds one value per column.
    for vals in zip(*self.values()):
        print(list(self))  # iterating over self yields the keys (column names)
        yield dict(zip(self, vals))


DataFrame.obs = obs
for ob in DataFrame(col1=v1, col2=v2, col3=v1).obs():
    print(ob)
list(filter(is_even,(x**2for x inrange(10)))) # Filtering a generator
[0, 4, 16, 36, 64]
Reduce
reduce an iterator to a single element
functools contains a bunch of useful functional
programming functions, including reduce
functools.reduce
It works by initializing an accumulator and repeatedly updating it with each element.
If the initial value is not supplied, Python initializes the
accumulator as acc=f(x,y), where x and y are the first two elements of
the iterator.
If the iterator has length 1, it will return that element.
Best to specify the initial value
1 2 3 4
import functools print(functools.reduce(lambda x,y: y/x, range(1,4),1)) # 1 is initial value print(functools.reduce(lambda x,y: x/y, range(1,4),1)) # 1 is initial value
1.5
0.16666666666666666
1
sum([[1,2,3],[4,5],[7,8,9]],[])
[1, 2, 3, 4, 5, 7, 8, 9]
理解reduce的运算规则
1 2 3 4 5 6 7
from functools import reduce elements = range(100) print(reduce(lambda accum,x:accum+1,elements)) elements = reversed(range(100)) print(reduce(lambda accum,x:accum+1,elements)) elements = reversed(range(100)) print(reduce(lambda accum,x:accum+1,elements,0))
99
198
100
map can be run in parallel
1 2 3 4 5 6 7 8
from collections import Counter


def mapper(line):
    """Return a Counter of the words on one line of text.

    The line is stripped of surrounding whitespace and split on single
    spaces.
    """
    words = line.strip().split(" ")
    return Counter(words)


# This method is really slow: every x + y builds a brand-new Counter.
# The with-block closes the file once the reduction has consumed it.
with open("2/hp1.txt", "rt") as hp1:
    result = reduce(lambda x, y: x + y, map(mapper, hp1))
def enron(limit=None):
    """Yield stripped lines from the gzipped file "7/email-Enron.txt.gz".

    Lines starting with "#" are skipped. At most `limit` lines are
    yielded; limit=None yields all of them. The file is read lazily and
    stays open only while the generator is being consumed.
    """
    with gzip.open("7/email-Enron.txt.gz", "rt") as f:
        lines = (line.strip() for line in f if not line.startswith("#"))
        yield from itertools.islice(lines, limit)