
df.select("Date", "StoreName").show()
spark.sql('''
select Date, StoreName
from sql_table
''').show()
df.count()
26160915
spark.sql('''
select count(*)
from sql_table
''').show()
+--------+
|count(1)|
+--------+
|26160915|
+--------+
df.filter(F.col("StoreName") == "URBAN LIQUOR").select("StoreName").show()
spark.sql('''
select StoreName
from sql_table
where StoreName = 'URBAN LIQUOR'
''').show()
df.filter(F.col("City").isNull()).select("StoreName", "City").show()
spark.sql('''
select StoreName, City
from sql_table
where City IS NULL
''').show()
df.filter(F.col("City").isNull()).select("StoreName", F.col("City").alias("City_null")).show()
spark.sql('''
select StoreName, City as City_null
from sql_table
where City IS NULL
''').show()
df.select(
"Pack",
F.when(
F.col("Pack") > 12, "over bulk"
).otherwise("unit").alias("Pack_bin")
).show()
spark.sql('''
select Pack,
       CASE WHEN Pack > 12 THEN 'over bulk' ELSE 'unit' END AS Pack_bin
from sql_table
''').show()
+----+---------+
|Pack| Pack_bin|
+----+---------+
| 12| unit|
| 12| unit|
| 12| unit|
| 12| unit|
| 12| unit|
| 6| unit|
| 24|over bulk|
| 24|over bulk|
| 6| unit|
| 12| unit|
| 24|over bulk|
| 12| unit|
| 24|over bulk|
| 6| unit|
| 24|over bulk|
| 12| unit|
| 12| unit|
| 12| unit|
| 6| unit|
| 6| unit|
+----+---------+
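For more than two buckets, when() calls chain before otherwise(), just as a CASE expression takes multiple WHEN branches. A minimal sketch, assuming the same df; the middle 'bulk' bucket is hypothetical, not part of the original binning:

df.select(
    "Pack",
    F.when(F.col("Pack") > 12, "over bulk")
     .when(F.col("Pack") > 6, "bulk")  # hypothetical middle bucket, for illustration only
     .otherwise("unit")
     .alias("Pack_bin")
).show()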
# Goal: find how many distinct CategoryName values each Category has.
# In PySpark this can be solved with size + collect_set over a window.
(df
.withColumn("CategoryName_cnt", F.size(F.collect_set("CategoryName").over(W.partitionBy("Category"))))
.filter(F.col("CategoryName_cnt") >= 2)
.select("CategoryName", "Category", "CategoryName_cnt")
).show()
+--------------------+--------+----------------+
| CategoryName|Category|CategoryName_cnt|
+--------------------+--------+----------------+
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
|STRAIGHT RYE WHIS...| 1011500| 2|
+--------------------+--------+----------------+
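As a cross-check, the same distinct counts can come from a plain aggregation instead of a window; a minimal sketch, assuming the same df:

(df
 .groupBy("Category")
 .agg(F.countDistinct("CategoryName").alias("CategoryName_cnt"))  # one row per Category
 .filter(F.col("CategoryName_cnt") >= 2)
).show()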
In SQL, max over the partition pulls out the largest dense_rank value, which serves as the distinct-name count.
(
df
.filter(F.col("Category") == 1011500)
# Registering the full table for the SQL WITH-clause query caused a Java server error, so only a filtered subset is shown here.
.createOrReplaceTempView('sql_table2')
)
spark.sql('''
WITH new_sql_table AS (
SELECT CategoryName, Category, max(CategoryName_dense) over (partition by Category) AS CategoryName_cnt
FROM (
SELECT CategoryName, Category,
dense_rank() OVER (PARTITION BY Category order by CategoryName) AS CategoryName_dense
FROM sql_table2
) as t
)
SELECT CategoryName, Category, CategoryName_cnt
FROM new_sql_table
WHERE CategoryName_cnt >= 2
''').show()
+--------------------+--------+----------------+
| CategoryName|Category|CategoryName_cnt|
+--------------------+--------+----------------+
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
|BOTTLED IN BOND B...| 1011500| 2|
+--------------------+--------+----------------+
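The dense_rank + max detour is needed because Spark SQL does not accept DISTINCT aggregates as window functions (COUNT(DISTINCT ...) OVER is rejected). Joining back to a grouped subquery is an alternative; a minimal sketch against sql_table2:

spark.sql('''
SELECT t.CategoryName, t.Category, c.CategoryName_cnt
FROM sql_table2 t
JOIN (
    -- one distinct count per Category, re-attached to the detail rows
    SELECT Category, COUNT(DISTINCT CategoryName) AS CategoryName_cnt
    FROM sql_table2
    GROUP BY Category
) c ON t.Category = c.Category
WHERE c.CategoryName_cnt >= 2
''').show()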
df.withColumn("Date", F.to_date(F.col("Date"), 'MM/dd/yyyy')).select("Date").show()
spark.sql('''
SELECT TO_DATE(Date, 'MM/dd/yyyy') Date
FROM sql_table2
''').show()
+----------+
| Date|
+----------+
|2012-05-03|
|2013-10-01|
|2015-01-15|
|2015-07-07|
|2013-08-06|
|2013-09-25|
|2012-09-11|
|2012-08-20|
|2015-08-20|
|2012-05-09|
|2012-08-13|
|2013-12-16|
|2012-12-06|
|2014-06-27|
|2013-10-29|
|2012-03-28|
|2012-11-12|
|2013-11-12|
|2014-01-02|
|2015-10-21|
+----------+
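Once the column is a real date, the usual date functions apply directly; a minimal sketch, assuming the same df:

(df
 .withColumn("Date", F.to_date(F.col("Date"), 'MM/dd/yyyy'))
 # year()/month() only work after to_date has parsed the string column
 .select("Date", F.year("Date").alias("year"), F.month("Date").alias("month"))
).show()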
df.select(F.col("StoreNumber").cast("string")).show()
spark.sql('''
SELECT CAST(StoreNumber as varchar(30)) -- varchar: variable-length string type
FROM sql_table2
''').show()
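To confirm the cast took effect, inspecting the schema is more reliable than reading show() output; a minimal sketch. Spark stores a SQL varchar(n) as a plain string type internally:

# after the cast, StoreNumber should appear as string in the schema
df.select(F.col("StoreNumber").cast("string")).printSchema()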