PySpark

import pyspark
import findspark
findspark.init()              # locate the local Spark installation and add it to sys.path
sc = pyspark.SparkContext()   # SparkContext is the entry point for building RDDs
data = sc.textFile('16/simple.txt')
data.collect()   # bring the RDD's contents back to the driver as a Python list
['Raining cats and dogs',
 'Cats eat mice, mice eat cheese',
 'Mice and cats and dogs have fleas']
data = sc.textFile('16/numbers.txt')
data2 = data.map(int)   # parse each line into an integer
data2.collect()
[10, 23, 16, 7, 12, 0, 1, 1, 2, 3, 5, 8, -1, 42, 64, 101, -101, 3]
from operator import add
data2.reduce(add)
196
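
A numeric RDD also has a built-in sum action, which gives the same total here:

data2.sum()   # -> 196, same as reduce(add)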

Some common RDD transformations:

  • map
  • filter
  • flatMap
  • sample
  • distinct
  • reduceByKey (distinct and reduceByKey are sketched just after this list)
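
Of these, distinct and reduceByKey do not appear elsewhere in these notes; a minimal sketch of both, using the SparkContext sc from above on a small illustrative set of pairs:

from operator import add
pairs = sc.parallelize([('a', 1), ('b', 2), ('a', 3), ('b', 4), ('a', 1)])
pairs.distinct().collect()        # drop duplicate elements (order may vary)
pairs.reduceByKey(add).collect()  # sum the values for each key -> [('a', 5), ('b', 6)] (order may vary)
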
data = sc.textFile('16/numbers.txt').map(int)
data.collect()
[10, 23, 16, 7, 12, 0, 1, 1, 2, 3, 5, 8, -1, 42, 64, 101, -101, 3]
data.filter(lambda x: x%2 == 0).collect()
[10, 16, 12, 0, 2, 8, 42, 64]

sample(withReplacement, fraction, [seed])

data.sample(True,0.3).collect()
[23, 23, 1, 1, 64]
data.sample(True,0.3,1).collect()
[23, 7, 0, -1, 64, 64, 101, 3]
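
For contrast, withReplacement=False draws each element at most once; an illustrative call (its output varies between runs unless a seed is given):

data.sample(False, 0.3).collect()   # roughly 30% of the elements, no element repeated by resampling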

For more complex data, PySpark relies on the pickle module to serialize Python objects.
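
A minimal round-trip sketch using Spark's pickle-backed files (the path '16/people.pickle' is only an illustrative name):

records = sc.parallelize([('Ada', 36, 1815), ('Alan', 41, 1912)])
records.saveAsPickleFile('16/people.pickle')   # each element is pickled, so nested tuples and other Python objects survive
sc.pickleFile('16/people.pickle').collect()    # -> [('Ada', 36, 1815), ('Alan', 41, 1912)] (order may vary)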

data = sc.textFile('16/scientists.txt')
data.collect()
['John Bardeen 3.1 EE 1908',
 'Eugene Wigner 3.2 Physics 1902',
 'Albert Einstein 4.0 Physics 1879',
 'Ronald Fisher 3.25 Statistics 1890',
 'Emmy Noether 2.9 Physics 1882',
 'Leonard Euler 3.9 Mathematics 1707',
 'Jerzy Neyman 3.5 Statistics 1894',
 'Ky Fan 3.55 Mathematics 1914']
data.map(lambda line: line.split()).collect()
[['John', 'Bardeen', '3.1', 'EE', '1908'],
 ['Eugene', 'Wigner', '3.2', 'Physics', '1902'],
 ['Albert', 'Einstein', '4.0', 'Physics', '1879'],
 ['Ronald', 'Fisher', '3.25', 'Statistics', '1890'],
 ['Emmy', 'Noether', '2.9', 'Physics', '1882'],
 ['Leonard', 'Euler', '3.9', 'Mathematics', '1707'],
 ['Jerzy', 'Neyman', '3.5', 'Statistics', '1894'],
 ['Ky', 'Fan', '3.55', 'Mathematics', '1914']]
data.map(str.split).collect()
[['John', 'Bardeen', '3.1', 'EE', '1908'],
 ['Eugene', 'Wigner', '3.2', 'Physics', '1902'],
 ['Albert', 'Einstein', '4.0', 'Physics', '1879'],
 ['Ronald', 'Fisher', '3.25', 'Statistics', '1890'],
 ['Emmy', 'Noether', '2.9', 'Physics', '1882'],
 ['Leonard', 'Euler', '3.9', 'Mathematics', '1707'],
 ['Jerzy', 'Neyman', '3.5', 'Statistics', '1894'],
 ['Ky', 'Fan', '3.55', 'Mathematics', '1914']]
d = data.map(str.split).map(lambda tup: (tup[0]+' '+tup[1],float(tup[2]),int(tup[4])))
d.collect()
[('John Bardeen', 3.1, 1908),
 ('Eugene Wigner', 3.2, 1902),
 ('Albert Einstein', 4.0, 1879),
 ('Ronald Fisher', 3.25, 1890),
 ('Emmy Noether', 2.9, 1882),
 ('Leonard Euler', 3.9, 1707),
 ('Jerzy Neyman', 3.5, 1894),
 ('Ky Fan', 3.55, 1914)]
d.map(lambda tup: tup[1]).mean()
3.425
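
The parsed tuples compose with filter as well, e.g. restricting to scientists born after 1900 before averaging:

d.filter(lambda tup: tup[2] > 1900).map(lambda tup: tup[1]).mean()   # mean GPA of Bardeen, Wigner and Fan -> about 3.283
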
data = sc.textFile('16/numbers_weird.txt')
data.collect()
['10 23 16', '7 12', '0', '1 1 2 3 5 8', '-1 42', '64 101 -101', '3']
data.map(str.split).collect()
[['10', '23', '16'],
 ['7', '12'],
 ['0'],
 ['1', '1', '2', '3', '5', '8'],
 ['-1', '42'],
 ['64', '101', '-101'],
 ['3']]
data.flatMap(str.split).collect()   # flatMap flattens the per-line lists into one RDD of tokens
['10',
 '23',
 '16',
 '7',
 '12',
 '0',
 '1',
 '1',
 '2',
 '3',
 '5',
 '8',
 '-1',
 '42',
 '64',
 '101',
 '-101',
 '3']

Some common RDD actions:

  • reduce
  • collect
  • count (a quick example follows this list)
  • countByKey
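
Of these, only count is not exercised below; a quick sketch on the same file:

sc.textFile('16/simple.txt').count()                      # number of lines -> 3
sc.textFile('16/simple.txt').flatMap(str.split).count()   # number of whitespace-separated tokens -> 17
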
data = sc.textFile('16/simple.txt')
data.collect()
['Raining cats and dogs',
 'Cats eat mice, mice eat cheese',
 'Mice and cats and dogs have fleas']
data.flatMap(str.split).map(str.lower).map(lambda x: x.replace(',','')).map(lambda x: (x,1)).countByKey()
defaultdict(int,
            {'raining': 1,
             'cats': 3,
             'and': 3,
             'dogs': 2,
             'eat': 2,
             'mice': 3,
             'cheese': 1,
             'have': 1,
             'fleas': 1})
data.flatMap(str.split).map(str.lower).map(lambda x: x.replace(',','')).countByKey()   # without the (word, 1) pairing, countByKey treats each string's first character as its key
defaultdict(int,
            {'r': 1, 'c': 4, 'a': 3, 'd': 2, 'e': 2, 'm': 3, 'h': 1, 'f': 1})
import pyspark
spark = pyspark.sql.SparkSession(sc)   # SparkSession: the entry point for DataFrames and Spark SQL, built on the existing SparkContext
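
With a SparkSession available, the parsed scientists RDD d from above can be turned into a DataFrame; a minimal sketch (the column names are illustrative):

df = spark.createDataFrame(d, ['name', 'gpa', 'year'])
df.show()                        # tabular view of the (name, gpa, year) tuples
df.filter(df.gpa > 3.5).show()   # the same kind of filtering, expressed on DataFrame columns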