reduceByKey add
import pyspark
from operator import add
sc = pyspark.SparkContext.getOrCreate()
# Sum the values for each key; reduceByKey combines partial sums map-side before the shuffle.
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
sorted(rdd.reduceByKey(add).collect())
# [('a', 2), ('b', 1)]
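The same aggregation works with any associative two-argument function; a minimal sketch using a lambda instead of operator.add:
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
sorted(rdd.reduceByKey(lambda a, b: a + b).collect())  # [('a', 2), ('b', 1)]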
groupByKey, mapValues
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
# groupByKey gathers all values per key; mapValues transforms each group.
sorted(rdd.groupByKey().mapValues(len).collect())   # [('a', 2), ('b', 1)]
sorted(rdd.groupByKey().mapValues(list).collect())  # [('a', [1, 1]), ('b', [1])]
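When only the per-key counts are needed, the countByKey action returns them directly as a dict on the driver, without materializing per-key groups; a small sketch:
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
sorted(rdd.countByKey().items())  # [('a', 2), ('b', 1)]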
sortByKey
tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
sc.parallelize(tmp).sortByKey().first()  # ('1', 3) -- string keys sort lexicographically
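sortByKey also takes an ascending flag; a sketch sorting the same pairs in descending key order:
tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
sc.parallelize(tmp).sortByKey(ascending=False).first()  # ('d', 4)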
keys
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
rdd.keys().collect()  # ['a', 'b', 'a'] -- keys() returns an RDD, so collect() is needed to see it
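values() is the matching transformation for the value side; a minimal sketch:
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
rdd.values().collect()  # [1, 1, 1]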
join
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3)])
# Inner join: one output pair per matching key combination.
sorted(x.join(y).collect())  # [('a', (1, 2)), ('a', (1, 3))]
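join is an inner join, so keys present on only one side are dropped; leftOuterJoin keeps every left-side key and pads missing matches with None. A sketch:
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3)])
sorted(x.leftOuterJoin(y).collect())  # [('a', (1, 2)), ('a', (1, 3)), ('b', (4, None))]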