df.show()
+----+----------+------+
| id|occupation|salary|
+----+----------+------+
|1000| engineer|100000|
|2000| NULL| NULL|
|3000| NULL| 50000|
|4000| teacher| 80000|
|5000| banker|120000|
+----+----------+------+
#다 지워
df.na.drop() , df.na.drop(how='any')
+----+----------+------+
| id|occupation|salary|
+----+----------+------+
|1000| engineer|100000|
|4000| teacher| 80000|
|5000| banker|120000|
+----+----------+------+
# thresh = n , null 포함 컬럼 n이상 포함 시 지워
df.na.drop(thresh=2)
+----+----------+------+
| id|occupation|salary|
+----+----------+------+
|1000| engineer|100000|
|3000| NULL| 50000|
|4000| teacher| 80000|
|5000| banker|120000|
+----+----------+------+
#특정 칼럼에 null 있으면 지워
df.na.drop(subset=["salary"])
+----+----------+------+
| id|occupation|salary|
+----+----------+------+
|1000| engineer|100000|
|3000| NULL| 50000|
|4000| teacher| 80000|
|5000| banker|120000|
+----+----------+------+
df.show()
+----+----------+------+
| id|occupation|salary|
+----+----------+------+
|1000| engineer|100000|
|2000| NULL| NULL|
|3000| NULL| 50000|
|4000| teacher| 80000|
|5000| banker|120000|
+----+----------+------+
#string 칼럼들은 전부 enginner로 채워
df.na.fill("engineer")
+----+----------+------+
| id|occupation|salary|
+----+----------+------+
|1000| engineer|100000|
|2000| engineer| NULL|
|3000| engineer| 50000|
|4000| teacher| 80000|
|5000| banker|120000|
+----+----------+------+
# int 타입 칼럼은 0으로 채워
df.na.fill(0)
+----+----------+------+
| id|occupation|salary|
+----+----------+------+
|1000| engineer|100000|
|2000| NULL| 0|
|3000| NULL| 50000|
|4000| teacher| 80000|
|5000| banker|120000|
+----+----------+------+
#특정 칼럼만 처리
df.na.fill("NA", subset=["occupation"])
+----+----------+------+
| id|occupation|salary|
+----+----------+------+
|1000| engineer|100000|
|2000| NA| NULL|
|3000| NA| 50000|
|4000| teacher| 80000|
|5000| banker|120000|
+----+----------+------+
df.select()로 칼럼 평균값 구해서 대체
df.select() 리턴타입도 dataFrame이기 때문에, .collect()로 받아서
.collect()[0][0]으로 처리 가능, .collect()로 변환 시 dataFrame에 위치로 값을 받아 올수 있음 .
df.show()
+----+----------+------+
| id|occupation|salary|
+----+----------+------+
|1000| engineer|100000|
|2000| NULL| NULL|
|3000| NULL| 50000|
|4000| teacher| 80000|
|5000| banker|120000|
+----+----------+------+
#평균구해서 fill
mv = df.select(f.mean(f.col('salary'))).collect()[0][0]
df.na.fill(mv, subset=['salary'])
+----+----------+------+
| id|occupation|salary|
+----+----------+------+
|1000| engineer|100000|
|2000| NULL| 87500|
|3000| NULL| 50000|
|4000| teacher| 80000|
|5000| banker|120000|
+----+----------+------+