# %%time is an IPython cell magic that reports CPU and wall-clock time for the cell.
%%time
# inferSchema=True forces a full pass over the CSV file to infer column types,
# which accounts for much of the wall time below.
df = spark.read.csv("Iowa_Liquor_Sales.csv", header=True, inferSchema=True)
# Result:
# CPU times: user 1.5 s, sys: 157 ms, total: 1.66 s
# Wall time: 3min 36s
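# The Parquet directory read in the next cell is not created in this section. A minimal
# sketch of how it could have been produced from the CSV DataFrame above;
# mode("overwrite") is an assumption, not part of the original notebook.
df.write.mode("overwrite").parquet("/content/drive/MyDrive/Colab Notebooks/data_parquet")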
%%time
# header is a CSV option and does not apply to Parquet; the schema is stored in the
# Parquet file footer, so no schema-inference pass is needed.
df_parquet = spark.read.parquet("/content/drive/MyDrive/Colab Notebooks/data_parquet")
# Result:
# CPU times: user 7.83 ms, sys: 137 µs, total: 7.97 ms
# Wall time: 398 ms
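# The Parquet read only touches file metadata at this point, which is why it returns in
# milliseconds. A quick check of the recovered schema (illustrative, not from the original):
df_parquet.printSchema()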
from pyspark.sql import functions as F
%%time
df1 = df.filter(F.col("City") == "MAXWELL")
# Result:
# CPU times: user 2.58 ms, sys: 0 ns, total: 2.58 ms
# Wall time: 117 ms
%%time
df_parquet1 = df_parquet.filter(F.col("City") == "MAXWELL")
# Result:
# CPU times: user 2.16 ms, sys: 0 ns, total: 2.16 ms
# Wall time: 68.3 ms
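# Both filter() timings above measure only plan construction, because filter() is a
# transformation. An action such as show() or count() is what actually reads the data.
# Illustrative sketch only; no timings are implied:
df1.show(5)          # triggers a scan of the CSV-backed DataFrame
df_parquet1.show(5)  # triggers a scan of the Parquet-backed DataFrame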
%%time
df.count()
# Result:
# CPU times: user 573 ms, sys: 58.3 ms, total: 631 ms
# Wall time: 58.4 s
# 26160915
# Spark evaluates lazily: filter() above only built an execution plan, while count()
# is an action that actually scans the data (see the sketch at the end of this section).
%%time
df_parquet.count()
# Result:
# CPU times: user 23 ms, sys: 2.59 ms, total: 25.5 ms
# Wall time: 3.27 s
# 26160915
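# Sketch of the lazy-evaluation point above: a transformation returns immediately because
# it only extends the logical plan, and the work happens when an action runs. Illustrative
# only; the explain() call and the chained count() are not part of the original notebook.
maxwell = df_parquet.filter(F.col("City") == "MAXWELL")  # transformation: returns instantly
maxwell.explain()                                         # prints the physical plan; still no data read
maxwell.count()                                           # action: this is where the scan actually happens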