pySpark16 - join

박성현·2024년 6월 17일

pySpark

목록 보기
16/17

from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)

# Spark session shared by all of the join examples below.
spark = SparkSession.builder.appName("df_join").getOrCreate()

# Sample user records: (id, name, company).
user_rows = [
    ("1000", "Neville Hardy", "Apple"),
    ("2000", "Dacia Cohen", "Alphabet"),
    ("3000", "Elois Cox", "Neflix"),
    ("4000", "Junita Meyer", "Meta"),
    ("5000", "Cleora Banks", "Amazon"),
]
df_user = spark.createDataFrame(data=user_rows, schema=['id', 'name', 'company'])
df_user.show()

# Sample salary records: (id, salary, department).
# Note: ids 4000/5000 have no salary row, and 6000 has no user row,
# which makes the join-type differences visible.
salary_rows = [
    ("1000", "150000", "engineer"),
    ("2000", "240000", "manager"),
    ("3000", "120000", "human resource"),
    ("6000", "100000", "sales"),
]
df_salary = spark.createDataFrame(data=salary_rows, schema=['id', 'salary', 'department'])
df_salary.show()

JOIN

#df_user
+----+-------------+--------+
|  id|         name| company|
+----+-------------+--------+
|1000|Neville Hardy|   Apple|
|2000|  Dacia Cohen|Alphabet|
|3000|    Elois Cox|  Neflix|
|4000| Junita Meyer|    Meta|
|5000| Cleora Banks|  Amazon|
+----+-------------+--------+

#df_salary
+----+------+--------------+
|  id|salary|    department|
+----+------+--------------+
|1000|150000|      engineer|
|2000|240000|       manager|
|3000|120000|human resource|
|6000|100000|         sales|
+----+------+--------------+
# LEFT JOIN: keep every df_user row; salary columns come back null for
# ids with no match (4000, 5000). .show() forces the lazy plan to execute
# and print, matching the earlier examples.
df_user.join(df_salary, df_user.id == df_salary.id, 'left').show()

# Compound join condition combined with &. When no join type is given,
# Spark defaults to an inner join. The id column holds strings, so compare
# against "1000" rather than the int 1000 (the original relied on Spark's
# implicit string->int cast).
df_user.join(df_salary, (df_user.id == df_salary.id) & (df_user.id == "1000")).show()

# Equivalent result: plain inner join followed by a where() filter.
df_user.join(df_salary, df_user.id == df_salary.id).where(df_user.id == "1000").show()

profile
다소Good한 데이터 엔지니어

0개의 댓글