# Create a local Spark context for this assignment and read the grade file
# into an RDD with one string element per line of text.
sc = SparkContext(master='local', appName='assignment')
# NOTE: mind the path setting -- point it at wherever grade.txt actually lives.
baserdd = sc.textFile('file:///home/ec2-user/RDD/grade.txt')
baserdd.collect()
['Mathew, science, grade-3, 45',
'Mathew, history, grade-2, 55',
'Mark, maths, grade-2, 23',
'Mark, science, grade-1, 76',
'John, history, grade-1, 14',
'John, maths, grade-2, 74',
'Lisa, science, grade-1, 24',
'Lisa, history, grade-3, 86',
'Andrew, maths, grade-1, 34',
'Andrew, science, grade-3, 26',
'Andrew, history, grade-1, 74',
'Mathew, science, grade-2, 55',
'Mathew, history, grade-2, 87',
'Mark, maths, grade-1, 92',
'Mark, science, grade-2, 12',
'John, history, grade-1, 67',
'John, maths, grade-1, 35',
'Lisa, science, grade-2, 24',
'Lisa, history, grade-2, 98',
'Andrew, maths, grade-1, 26',
'Andrew, science, grade-3, 44',
'Andrew, history, grade-2, 77']
(단, 점수는 int형으로)
def _split_fields(line):
    """Split one raw text line into its fields on the ", " separator."""
    return line.split(", ")


# Turn every raw line into a list of string fields.
b0 = baserdd.map(_split_fields)
b0.collect()
[['Mathew', 'science', 'grade-3', '45'],
['Mathew', 'history', 'grade-2', '55'],
['Mark', 'maths', 'grade-2', '23'],
['Mark', 'science', 'grade-1', '76'],
['John', 'history', 'grade-1', '14'],
def _score_to_int(fields):
    """Return a new list equal to ``fields`` with its last element cast to int."""
    leading = fields[:-1]
    score = int(fields[-1])
    return leading + [score]


# The score is the last field of each record; store it as an int, not a string.
b1 = b0.map(_score_to_int)
b1.collect()
[['Mathew', 'science', 'grade-3', 45],
['Mathew', 'history', 'grade-2', 55],
['Mark', 'maths', 'grade-2', 23],
['Mark', 'science', 'grade-1', 76],
['John', 'history', 'grade-1', 14],
# Emit a (subject, 1) pair per record; the subject is the field at index 1.
b2 = b1.map(lambda record: (record[1], 1))
b2.collect()
[('science', 1),
('history', 1),
('maths', 1),
('science', 1),
def _add_counts(left, right):
    """Combine two partial counts for the same key."""
    return left + right


# Sum the per-record 1s to get the number of records for each subject.
b2 = b2.reduceByKey(_add_counts)
b2.collect()
[('science', 8), ('history', 8), ('maths', 6)]
이름이 같아도 학년이 다르면 다른 사람임
def _key_by_name_and_grade(record):
    """Map a record to ((name, grade), (score, 1)).

    The key combines the fields at index 0 and index 2; the value carries the
    score (index 3) together with a count of 1 so sums and counts can be
    accumulated side by side.
    """
    student_key = (record[0], record[2])
    score_and_count = (record[3], 1)
    return (student_key, score_and_count)


# Students sharing a name but in different grades get different keys.
b5 = b1.map(_key_by_name_and_grade)
b5.collect()
[(('Mathew', 'grade-3'), (45, 1)),
(('Mathew', 'grade-2'), (55, 1)),
(('Mark', 'grade-2'), (23, 1)),
(('Mark', 'grade-1'), (76, 1)),
(('John', 'grade-1'), (14, 1)),
(('John', 'grade-2'), (74, 1)),
(('Lisa', 'grade-1'), (24, 1)),
(('Lisa', 'grade-3'), (86, 1)),
(('Andrew', 'grade-1'), (34, 1)),
(('Andrew', 'grade-3'), (26, 1)),
(('Andrew', 'grade-1'), (74, 1)),
(('Mathew', 'grade-2'), (55, 1)),
(('Mathew', 'grade-2'), (87, 1)),
(('Mark', 'grade-1'), (92, 1)),
(('Mark', 'grade-2'), (12, 1)),
(('John', 'grade-1'), (67, 1)),
(('John', 'grade-1'), (35, 1)),
(('Lisa', 'grade-2'), (24, 1)),
(('Lisa', 'grade-2'), (98, 1)),
(('Andrew', 'grade-1'), (26, 1)),
(('Andrew', 'grade-3'), (44, 1)),
(('Andrew', 'grade-2'), (77, 1))]
def _merge_score_and_count(left, right):
    """Add two (score sum, count) pairs element-wise."""
    score_sum = left[0] + right[0]
    count = left[1] + right[1]
    return (score_sum, count)


# Accumulate the total score and the number of records per (name, grade) key.
totalByName = b5.reduceByKey(_merge_score_and_count)
totalByName.collect()
[(('Mathew', 'grade-3'), (45, 1)),
(('Mathew', 'grade-2'), (197, 3)),
(('Mark', 'grade-2'), (35, 2)),
(('Mark', 'grade-1'), (168, 2)),
(('John', 'grade-1'), (116, 3)),
(('John', 'grade-2'), (74, 1)),
(('Lisa', 'grade-1'), (24, 1)),
(('Lisa', 'grade-3'), (86, 1)),
(('Andrew', 'grade-1'), (134, 3)),
(('Andrew', 'grade-3'), (70, 2)),
(('Lisa', 'grade-2'), (122, 2)),
(('Andrew', 'grade-2'), (77, 1))]
# Divide each (score sum, count) value to obtain the mean score per key;
# the keys themselves are left unchanged by mapValues.
averageByName = totalByName.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)
averageByName.collect()
[(('Mathew', 'grade-3'), 45.0),
(('Mathew', 'grade-2'), 65.66666666666667),
(('Mark', 'grade-2'), 17.5),
(('Mark', 'grade-1'), 84.0),
(('John', 'grade-1'), 38.666666666666664),
(('John', 'grade-2'), 74.0),
(('Lisa', 'grade-1'), 24.0),
(('Lisa', 'grade-3'), 86.0),
(('Andrew', 'grade-1'), 44.666666666666664),
(('Andrew', 'grade-3'), 35.0),
(('Lisa', 'grade-2'), 61.0),
(('Andrew', 'grade-2'), 77.0)]