PySpark

import pyspark
import findspark
findspark.init()              # locate the local Spark installation and add it to sys.path
sc = pyspark.SparkContext()   # SparkContext is the entry point for building RDDs
data = sc.textFile('16/simple.txt')
data.collect()   # bring the RDD's contents back to the driver as a Python list
['Raining cats and dogs',
 'Cats eat mice, mice eat cheese',
 'Mice and cats and dogs have fleas']
data = sc.textFile('16/numbers.txt')
data2 = data.map(int)   # parse each line into an integer
data2.collect()
[10, 23, 16, 7, 12, 0, 1, 1, 2, 3, 5, 8, -1, 42, 64, 101, -101, 3]
from operator import add
data2.reduce(add)
196
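
A numeric RDD also has a built-in sum action, which gives the same total here:

data2.sum()   # -> 196, same as reduce(add)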

Some common RDD transformations:

  • map
  • filter
  • flatMap
  • sample
  • distinct
  • reduceByKey (distinct and reduceByKey are sketched just after this list)
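
Of these, distinct and reduceByKey do not appear elsewhere in these notes; a minimal sketch of both, using the SparkContext sc from above on a small illustrative set of pairs:

from operator import add
pairs = sc.parallelize([('a', 1), ('b', 2), ('a', 3), ('b', 4), ('a', 1)])
pairs.distinct().collect()        # drop duplicate elements (order may vary)
pairs.reduceByKey(add).collect()  # sum the values for each key -> [('a', 5), ('b', 6)] (order may vary)
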
data = sc.textFile('16/numbers.txt').map(int)
data.collect()
[10, 23, 16, 7, 12, 0, 1, 1, 2, 3, 5, 8, -1, 42, 64, 101, -101, 3]
data.filter(lambda x: x%2 == 0).collect()
[10, 16, 12, 0, 2, 8, 42, 64]

sample(withReplacement, fraction, [seed])

data.sample(True,0.3).collect()
[23, 23, 1, 1, 64]
data.sample(True,0.3,1).collect()
[23, 7, 0, -1, 64, 64, 101, 3]
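
For contrast, withReplacement=False draws each element at most once; an illustrative call (its output varies between runs unless a seed is given):

data.sample(False, 0.3).collect()   # roughly 30% of the elements, no element repeated by resampling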

For more complex data, PySpark relies on the pickle module to serialize Python objects.
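
A minimal round-trip sketch using Spark's pickle-backed files (the path '16/people.pickle' is only an illustrative name):

records = sc.parallelize([('Ada', 36, 1815), ('Alan', 41, 1912)])
records.saveAsPickleFile('16/people.pickle')   # each element is pickled, so nested tuples and other Python objects survive
sc.pickleFile('16/people.pickle').collect()    # -> [('Ada', 36, 1815), ('Alan', 41, 1912)] (order may vary)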

data = sc.textFile('16/scientists.txt')
data.collect()
['John Bardeen 3.1 EE 1908',
 'Eugene Wigner 3.2 Physics 1902',
 'Albert Einstein 4.0 Physics 1879',
 'Ronald Fisher 3.25 Statistics 1890',
 'Emmy Noether 2.9 Physics 1882',
 'Leonard Euler 3.9 Mathematics 1707',
 'Jerzy Neyman 3.5 Statistics 1894',
 'Ky Fan 3.55 Mathematics 1914']
data.map(lambda line: line.split()).collect()
[['John', 'Bardeen', '3.1', 'EE', '1908'],
 ['Eugene', 'Wigner', '3.2', 'Physics', '1902'],
 ['Albert', 'Einstein', '4.0', 'Physics', '1879'],
 ['Ronald', 'Fisher', '3.25', 'Statistics', '1890'],
 ['Emmy', 'Noether', '2.9', 'Physics', '1882'],
 ['Leonard', 'Euler', '3.9', 'Mathematics', '1707'],
 ['Jerzy', 'Neyman', '3.5', 'Statistics', '1894'],
 ['Ky', 'Fan', '3.55', 'Mathematics', '1914']]
data.map(str.split).collect()
[['John', 'Bardeen', '3.1', 'EE', '1908'],
 ['Eugene', 'Wigner', '3.2', 'Physics', '1902'],
 ['Albert', 'Einstein', '4.0', 'Physics', '1879'],
 ['Ronald', 'Fisher', '3.25', 'Statistics', '1890'],
 ['Emmy', 'Noether', '2.9', 'Physics', '1882'],
 ['Leonard', 'Euler', '3.9', 'Mathematics', '1707'],
 ['Jerzy', 'Neyman', '3.5', 'Statistics', '1894'],
 ['Ky', 'Fan', '3.55', 'Mathematics', '1914']]
d = data.map(str.split).map(lambda tup: (tup[0]+' '+tup[1],float(tup[2]),int(tup[4])))
d.collect()
[('John Bardeen', 3.1, 1908),
 ('Eugene Wigner', 3.2, 1902),
 ('Albert Einstein', 4.0, 1879),
 ('Ronald Fisher', 3.25, 1890),
 ('Emmy Noether', 2.9, 1882),
 ('Leonard Euler', 3.9, 1707),
 ('Jerzy Neyman', 3.5, 1894),
 ('Ky Fan', 3.55, 1914)]
d.map(lambda tup: tup[1]).mean()
3.425
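
The parsed tuples compose with filter as well, e.g. restricting to scientists born after 1900 before averaging:

d.filter(lambda tup: tup[2] > 1900).map(lambda tup: tup[1]).mean()   # mean GPA of Bardeen, Wigner and Fan -> about 3.283
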
data = sc.textFile('16/numbers_weird.txt')
data.collect()
['10 23 16', '7 12', '0', '1 1 2 3 5 8', '-1 42', '64 101 -101', '3']
data.map(str.split).collect()
[['10', '23', '16'],
 ['7', '12'],
 ['0'],
 ['1', '1', '2', '3', '5', '8'],
 ['-1', '42'],
 ['64', '101', '-101'],
 ['3']]
data.flatMap(str.split).collect()   # flatMap flattens the per-line lists into one RDD of tokens
['10',
 '23',
 '16',
 '7',
 '12',
 '0',
 '1',
 '1',
 '2',
 '3',
 '5',
 '8',
 '-1',
 '42',
 '64',
 '101',
 '-101',
 '3']

Some common RDD actions:

  • reduce
  • collect
  • count (a quick example follows this list)
  • countByKey
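
Of these, only count is not exercised below; a quick sketch on the same file:

sc.textFile('16/simple.txt').count()                      # number of lines -> 3
sc.textFile('16/simple.txt').flatMap(str.split).count()   # number of whitespace-separated tokens -> 17
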
data = sc.textFile('16/simple.txt')
data.collect()
['Raining cats and dogs',
 'Cats eat mice, mice eat cheese',
 'Mice and cats and dogs have fleas']
data.flatMap(str.split).map(str.lower).map(lambda x: x.replace(',','')).map(lambda x: (x,1)).countByKey()
defaultdict(int,
            {'raining': 1,
             'cats': 3,
             'and': 3,
             'dogs': 2,
             'eat': 2,
             'mice': 3,
             'cheese': 1,
             'have': 1,
             'fleas': 1})
data.flatMap(str.split).map(str.lower).map(lambda x: x.replace(',','')).countByKey()   # without the (word, 1) pairing, countByKey treats each string's first character as its key
defaultdict(int,
            {'r': 1, 'c': 4, 'a': 3, 'd': 2, 'e': 2, 'm': 3, 'h': 1, 'f': 1})
import pyspark
spark = pyspark.sql.SparkSession(sc)   # SparkSession: the entry point for DataFrames and Spark SQL, built on the existing SparkContext
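
With a SparkSession available, the parsed scientists RDD d from above can be turned into a DataFrame; a minimal sketch (the column names are illustrative):

df = spark.createDataFrame(d, ['name', 'gpa', 'year'])
df.show()                        # tabular view of the (name, gpa, year) tuples
df.filter(df.gpa > 3.5).show()   # the same kind of filtering, expressed on DataFrame columns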