from pyspark.sql import (
functions as f,
SparkSession,
types as t
)
# Local Spark session used by all of the join examples below.
spark = SparkSession.builder.appName("df_join").getOrCreate()

# Users table: (id, name, company).  Ids are kept as strings so they line
# up with the string ids in the salary table.
user_col = ['id', 'name', 'company']
user_data = [
    ["1000", "Neville Hardy", "Apple"],
    ["2000", "Dacia Cohen", "Alphabet"],
    ["3000", "Elois Cox", "Neflix"],
    ["4000", "Junita Meyer", "Meta"],
    ["5000", "Cleora Banks", "Amazon"],
]
df_user = spark.createDataFrame(data=user_data, schema=user_col)
df_user.show()

# Salaries table: (id, salary, department).  Id 6000 has no matching user,
# and users 4000/5000 have no salary row — that asymmetry is what makes the
# different join types below interesting.
salary_col = ['id', 'salary', 'department']
salary_data = [
    ["1000", "150000", "engineer"],
    ["2000", "240000", "manager"],
    ["3000", "120000", "human resource"],
    ["6000", "100000", "sales"],
]
df_salary = spark.createDataFrame(data=salary_data, schema=salary_col)
df_salary.show()
# ---- JOIN examples ----
# The two tables printed by df_user.show() and df_salary.show() above:
#df_user
# +----+-------------+--------+
# | id| name| company|
# +----+-------------+--------+
# |1000|Neville Hardy| Apple|
# |2000| Dacia Cohen|Alphabet|
# |3000| Elois Cox| Neflix|
# |4000| Junita Meyer| Meta|
# |5000| Cleora Banks| Amazon|
# +----+-------------+--------+
#df_salary
# +----+------+--------------+
# | id|salary| department|
# +----+------+--------------+
# |1000|150000| engineer|
# |2000|240000| manager|
# |3000|120000|human resource|
# |6000|100000| sales|
# +----+------+--------------+
# Spark transformations are lazy: without an action such as .show(), the
# original three statements computed and displayed nothing.  Each join is
# now followed by .show() so its result is actually materialized.

# Left (outer) join: keep every user row; users 4000/5000 get nulls for
# the salary columns, and the unmatched salary id 6000 is dropped.
df_user.join(df_salary, df_user.id == df_salary.id, 'left').show()

# Inner join with an extra predicate folded into the join condition:
# only the row pair for id 1000 survives.
df_user.join(df_salary, (df_user.id == df_salary.id) & (df_user.id == 1000)).show()

# Same result as above, expressed as a plain inner join followed by a
# .where() filter — Spark's optimizer pushes the filter into the join.
df_user.join(df_salary, df_user.id == df_salary.id).where(df_user.id == 1000).show()