Extracting the top N rows per group with a DataFrame
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{desc, rank}
...
import sparkSession.implicits._ // needed for the $"..." column syntax

// Rank rows within each (group_id, subgroup_id) partition by descending count.
val dataSetWindow = Window.partitionBy("group_id", "subgroup_id").orderBy(desc("count"))
val rankLimit: Int = 100

// Count each (group_id, subgroup_id, mydata) combination, rank the counts
// within their group, and keep only the top rankLimit rows per group.
val groupedDataFrame = myDataFrame.groupBy("group_id", "subgroup_id", "mydata")
val myDataSet = groupedDataFrame.count
  .withColumn("rank", rank().over(dataSetWindow))
  .where($"rank" <= rankLimit)
myDataSet.show(1000) // action
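A self-contained toy run of the same pattern may help; everything below (the local spark session, the sample rows, the top-1 limit) is illustrative and not from the original:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{desc, rank}

val spark = SparkSession.builder().appName("TopNExample").master("local[*]").getOrCreate()
import spark.implicits._

// Illustrative sample data: (group_id, subgroup_id, mydata).
val sample = Seq(
  ("g1", "s1", "a"), ("g1", "s1", "a"), ("g1", "s1", "b"),
  ("g1", "s2", "c"), ("g1", "s2", "d")
).toDF("group_id", "subgroup_id", "mydata")

val w = Window.partitionBy("group_id", "subgroup_id").orderBy(desc("count"))
sample.groupBy("group_id", "subgroup_id", "mydata").count
  .withColumn("rank", rank().over(w))
  .where($"rank" <= 1) // keep only the most frequent mydata per subgroup
  .show()
// Note: rank() gives ties the same rank, so ties can return more than N rows;
// use row_number() instead when exactly N rows per group are required.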
Reading multiple files into a DataFrame
import org.apache.spark.sql.SparkSession

// Tab-delimited CSV: the brace glob {a,b} selects specific files.
// testSchema is a StructType defined elsewhere.
val sparkSession = SparkSession.builder().appName("MyTest").getOrCreate()
val testDF = sparkSession.read.format("com.databricks.spark.csv")
  .option("delimiter", "\t").schema(testSchema)
  .load("hdfs://localhost:9000/user/root/DataBase/test/{test1.log,test2.log}")
// The same brace glob selects multiple files with the Parquet reader.
val testParquetDF = sparkSession.read.schema(testSchema)
  .parquet("hdfs://localhost:9000/user/root/DataBase/test/{test1.log,test2.log}")
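DataFrameReader.parquet also accepts several paths as varargs, and the path string may contain standard glob wildcards; a sketch assuming the same sparkSession and testSchema (the df1/df2 names and the *.log wildcard are illustrative):

// Pass each file path explicitly (parquet takes varargs)...
val df1 = sparkSession.read.schema(testSchema).parquet(
  "hdfs://localhost:9000/user/root/DataBase/test/test1.log",
  "hdfs://localhost:9000/user/root/DataBase/test/test2.log")

// ...or match every .log file in the directory with a glob.
val df2 = sparkSession.read.schema(testSchema)
  .parquet("hdfs://localhost:9000/user/root/DataBase/test/*.log")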