스파크 핵심 데이터소스
핵심 서드파티 데이터소스
# Glob that matches every CSV file under the "all" directory.
path = '/FileStore/tables/all/*.csv'
# Evaluating spark.read yields the session's DataFrameReader.
spark.read
Out[1]: <pyspark.sql.readwriter.DataFrameReader at 0x7f6b4e3e9f40>
# Load every CSV matched by `path`. FAILFAST aborts the read on a malformed
# record, and inferSchema asks Spark to derive the column types from the data.
df = (
    spark.read.format('csv')
    .options(mode='FAILFAST', inferSchema='true', path=path)
    .load()
)
# Write the DataFrame back out as CSV.
# The save mode has to be set through DataFrameWriter.mode(); passing it as
# option('mode', ...) only forwards a data-source option and leaves the
# default save mode in place, so the write fails once the target path exists.
(
    df.write.format('csv')
    .mode('overwrite')
    .option('dateFormat', 'yyyy-MM-dd')
    .option('path', '/FileStore/tables/temp/temp.csv')
    .save()
)
# Re-read the same CSV files, this time treating the first line of each file
# as the header, then preview the first five rows.
df = (
    spark.read.format('csv')
    .options(header='true', mode='FAILFAST')
    .load(path)
)
df.show(5)
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
536365| 85123A|WHITE HANGING HEA...| 6|12/1/2010 8:26| 2.55| 17850|United Kingdom|
536365| 71053| WHITE METAL LANTERN| 6|12/1/2010 8:26| 3.39| 17850|United Kingdom|
536365| 84406B|CREAM CUPID HEART...| 8|12/1/2010 8:26| 2.75| 17850|United Kingdom|
536365| 84029G|KNITTED UNION FLA...| 6|12/1/2010 8:26| 3.39| 17850|United Kingdom|
536365| 84029E|RED WOOLLY HOTTIE...| 6|12/1/2010 8:26| 3.39| 17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows
# Save the DataFrame as tab-separated text by overriding the CSV separator.
(
    df.write.format('csv')
    .mode('overwrite')
    .option('sep', '\t')
    .save('/FileStore/tables/temp/tsv_file.tsv')
)

# Read the JSON summary file and preview the first five rows.
j_path = '/FileStore/tables/2010_summary.json'
(
    spark.read.format('json')
    .options(mode='FAILFAST', inferSchema='true')
    .load(j_path)
    .show(5)
)
+-----------------+-------------------+-----+
DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
United States| Romania| 1|
United States| Ireland| 264|
United States| India| 69|
Egypt| United States| 24|
Equatorial Guinea| United States| 1|
+-----------------+-------------------+-----+
only showing top 5 rows
# Save the DataFrame as JSON, replacing any previous output.
(
    df.write.format('json')
    .mode('overwrite')
    .save('/FileStore/tables/temp/j_file.json')
)

# Read the Parquet summary directory and preview the first five rows.
p_path = '/FileStore/tables/2010-summary.parquet'
(
    spark.read.format('parquet')
    .load(p_path)
    .show(5)
)
+-----------------+-------------------+-----+
DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
United States| Romania| 1|
United States| Ireland| 264|
United States| India| 69|
Egypt| United States| 24|
Equatorial Guinea| United States| 1|
+-----------------+-------------------+-----+
only showing top 5 rows
# Save the DataFrame as Parquet, replacing any previous output.
(
    df.write.format('parquet')
    .mode('overwrite')
    .save('/FileStore/tables/temp/p_file.parquet')
)

# Read a single ORC part file and preview the first five rows.
o_path = '/FileStore/tables/2010-summary.orc/part_r_00000_2c4f7d96_e703_4de3_af1b_1441d172c80f_snappy.orc'
(
    spark.read.format('orc')
    .load(o_path)
    .show(5)
)
+-----------------+-------------------+-----+
DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
United States| Romania| 1|
United States| Ireland| 264|
United States| India| 69|
Egypt| United States| 24|
Equatorial Guinea| United States| 1|
+-----------------+-------------------+-----+
only showing top 5 rows
# Save the DataFrame as ORC, replacing any previous output.
(
    df.write.format('orc')
    .mode('overwrite')
    .save('/FileStore/tables/temp/o_file.orc')
)

# Connection settings for the SQLite database accessed through JDBC.
driver = 'org.sqlite.JDBC'
s_path = 'dbfs:/FileStore/tables/my_sqlite.db'
url = f'jdbc:sqlite:{s_path}'
tablename = 'flight_info'
%fs ls /FileStore/tables
path | name | size |
---|---|---|
dbfs:/FileStore/tables/2010-summary.orc/ | 2010-summary.orc/ | 0 |
dbfs:/FileStore/tables/2010-summary.parquet/ | 2010-summary.parquet/ | 0 |
dbfs:/FileStore/tables/2010_summary.csv | 2010_summary.csv | 7121 |
dbfs:/FileStore/tables/2010_summary.json | 2010_summary.json | 21353 |
dbfs:/FileStore/tables/all/ | all/ | 0 |
dbfs:/FileStore/tables/my_sqlite.db | my_sqlite.db | 11264 |
dbfs:/FileStore/tables/my_sqlite_copy.db | my_sqlite_copy.db | 11264 |
dbfs:/FileStore/tables/temp/ | temp/ | 0 |
# References for the file-path problem:
# https://stackoverflow.com/questions/68202341/analysisexception-path-does-not-exist-dbfs-databricks-python-lib-python3-7-si
# https://docs.databricks.com/dev-tools/databricks-utils.html
# NOTE(review): the copy target has no scheme and the /tmp listing shows it as
# dbfs:/tmp/my_sqlite_copy.db; confirm the SQLite JDBC driver can open that
# location through the plain '/tmp/...' path used in the JDBC URL.
copy_path = '/FileStore/tables/my_sqlite_copy.db'
new_path = '/tmp/my_sqlite_copy.db'
dbutils.fs.cp(copy_path, new_path)
%fs ls /tmp
path | name | size |
---|---|---|
dbfs:/tmp/hive/ | hive/ | 0 |
dbfs:/tmp/my_sqlite_copy.db | my_sqlite_copy.db | 11264 |
# Point the JDBC URL at the copied database file and echo it for inspection.
url = f'jdbc:sqlite:{new_path}'
url
Out[8]: 'jdbc:sqlite:/tmp/my_sqlite_copy.db'
# Read the whole table through JDBC.
dbDF = (
    spark.read.format('jdbc')
    .options(url=url, dbtable=tablename, driver=driver)
    .load()
)
# Builds a filtered DataFrame; the result is not assigned and no action is run on it.
dbDF.filter("DEST_COUNTRY_NAME in ('Anguilla', 'Sweden')")

# Query pushdown: pass a parenthesised sub-query with an alias in place of the
# table name so the query text is sent to the database.
pushdownQuery = "(select distinct(dest_country_name) from flight_info) as flight_info"
dbDF = (
    spark.read.format('jdbc')
    .options(url=url, dbtable=pushdownQuery, driver=driver)
    .load()
)
스파크는 파일 크기, 파일 유형, 압축 방식에 따른 '분할 가능성'에 따라 여러 파일을 읽어 하나의 파티션으로 만들거나, 하나의 파일을 여러 파티션으로 나누어 읽을 수 있음
파일이 가진 이런 유연성은 SQL 데이터베이스에도 존재하지만 몇 가지 수동 설정이 필요
# Same JDBC read as before, additionally passing the numPartitions option.
dbDF = (
    spark.read.format('jdbc')
    .options(url=url, dbtable=tablename, driver=driver)
    .option('numPartitions', 10)
    .load()
)
props = {'driver': 'org.sqlite.JDBC'}
# Column used to split the read into partitions.
colName = 'count'
# Minimum and maximum values spanning the first through last partitions
# (every value outside this range falls into the first or the last partition).
lowerBound = 0
upperBound = 348113
# Number of partitions.
numPartitions = 10
spark.read.jdbc(
    url,
    tablename,
    column=colName,
    lowerBound=lowerBound,
    upperBound=upperBound,
    numPartitions=numPartitions,
    properties=props,
)
# Write the DataFrame to a table in a new SQLite database through JDBC.
# NOTE(review): the URL uses '//tmp' and the file name 'new_qlite.db'; confirm
# both are intended (they look like typos for '/tmp' and 'new_sqlite.db').
newPath = 'jdbc:sqlite://tmp/new_qlite.db'
df.write.jdbc(
    newPath,
    tablename,
    properties=props,
)
# Read the CSV as plain text (one `value` column per line), split each line on
# commas into an array column named `rows`, and preview five rows.
(
    spark.read.format('text')
    .load('/FileStore/tables/2010_summary.csv')
    .selectExpr("split(value,',') as rows")
    .show(5)
)
+--------------------+
rows|
+--------------------+
[DEST_COUNTRY_NAM...|
[United States, R...|
[United States, I...|
[United States, I...|
[Egypt, United St...|
+--------------------+
only showing top 5 rows
# Write only the StockCode column out as plain text.
(
    df.select('StockCode')
    .write
    .text('/FileStore/tables/tmp.txt')
)
%fs ls /FileStore/tables
path | name | size |
---|---|---|
dbfs:/FileStore/tables/2010-summary.orc/ | 2010-summary.orc/ | 0 |
dbfs:/FileStore/tables/2010-summary.parquet/ | 2010-summary.parquet/ | 0 |
dbfs:/FileStore/tables/2010_summary.csv | 2010_summary.csv | 7121 |
dbfs:/FileStore/tables/2010_summary.json | 2010_summary.json | 21353 |
dbfs:/FileStore/tables/all/ | all/ | 0 |
dbfs:/FileStore/tables/my_sqlite.db | my_sqlite.db | 11264 |
dbfs:/FileStore/tables/my_sqlite_copy.db | my_sqlite_copy.db | 11264 |
dbfs:/FileStore/tables/temp/ | temp/ | 0 |
dbfs:/FileStore/tables/tmp.txt/ | tmp.txt/ | 0 |
# Repartition to 5 partitions so that 5 data files are created inside the folder.
(
    df.repartition(5)
    .write.format('csv')
    .save('/tmp/multiple.csv')
)
%fs ls /tmp/multiple.csv
path | name | size |
---|---|---|
dbfs:/tmp/multiple.csv/_SUCCESS | _SUCCESS | 0 |
dbfs:/tmp/multiple.csv/_committed_7308919149863971029 | _committed_7308919149863971029 | 463 |
dbfs:/tmp/multiple.csv/_started_7308919149863971029 | _started_7308919149863971029 | 0 |
dbfs:/tmp/multiple.csv/part-00000-tid-7308919149863971029-29eda532-ce72-4f1b-83a7-f0340c44f34f-9-1-c000.csv | part-00000-tid-7308919149863971029-29eda532-ce72-4f1b-83a7-f0340c44f34f-9-1-c000.csv | 9040894 |
dbfs:/tmp/multiple.csv/part-00001-tid-7308919149863971029-29eda532-ce72-4f1b-83a7-f0340c44f34f-10-1-c000.csv | part-00001-tid-7308919149863971029-29eda532-ce72-4f1b-83a7-f0340c44f34f-10-1-c000.csv | 9046349 |
dbfs:/tmp/multiple.csv/part-00002-tid-7308919149863971029-29eda532-ce72-4f1b-83a7-f0340c44f34f-11-1-c000.csv | part-00002-tid-7308919149863971029-29eda532-ce72-4f1b-83a7-f0340c44f34f-11-1-c000.csv | 9038812 |
dbfs:/tmp/multiple.csv/part-00003-tid-7308919149863971029-29eda532-ce72-4f1b-83a7-f0340c44f34f-13-1-c000.csv | part-00003-tid-7308919149863971029-29eda532-ce72-4f1b-83a7-f0340c44f34f-13-1-c000.csv | 9036785 |
dbfs:/tmp/multiple.csv/part-00004-tid-7308919149863971029-29eda532-ce72-4f1b-83a7-f0340c44f34f-12-1-c000.csv | part-00004-tid-7308919149863971029-29eda532-ce72-4f1b-83a7-f0340c44f34f-12-1-c000.csv | 9041847 |
# Partition the output by InvoiceNo value.
# No format() is given, so the session's default data source decides the
# on-disk format regardless of the '.parquet' suffix in the path.
(
    df.limit(10)
    .write.mode('overwrite')
    .partitionBy('InvoiceNo')
    .save('/tmp/partitioned-files.parquet')
)
# Show the same ten rows that were written.
display(df.limit(10))
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country |
---|---|---|---|---|---|---|---|
536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850 | United Kingdom |
536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850 | United Kingdom |
536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 12/1/2010 8:26 | 7.65 | 17850 | United Kingdom |
536365 | 21730 | GLASS STAR FROSTED T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 4.25 | 17850 | United Kingdom |
536366 | 22633 | HAND WARMER UNION JACK | 6 | 12/1/2010 8:28 | 1.85 | 17850 | United Kingdom |
536366 | 22632 | HAND WARMER RED POLKA DOT | 6 | 12/1/2010 8:28 | 1.85 | 17850 | United Kingdom |
536367 | 84879 | ASSORTED COLOUR BIRD ORNAMENT | 32 | 12/1/2010 8:34 | 1.69 | 13047 | United Kingdom |
%fs ls /tmp/partitioned-files.parquet
path | name | size |
---|---|---|
dbfs:/tmp/partitioned-files.parquet/InvoiceNo=536365/ | InvoiceNo=536365/ | 0 |
dbfs:/tmp/partitioned-files.parquet/InvoiceNo=536366/ | InvoiceNo=536366/ | 0 |
dbfs:/tmp/partitioned-files.parquet/InvoiceNo=536367/ | InvoiceNo=536367/ | 0 |
dbfs:/tmp/partitioned-files.parquet/_delta_log/ | _delta_log/ | 0 |
# Count the distinct InvoiceNo values in the DataFrame.
# NOTE(review): `F` is not defined anywhere in view; presumably
# `from pyspark.sql import functions as F` ran in an earlier cell - confirm.
(
    df.select(F.countDistinct('InvoiceNo'))
    .collect()
)
Out[16]: [Row(count(DISTINCT InvoiceNo)=25900)]
# Bucketing: group the data by bucket and save it as a fixed number of files.
bucketNum = 10
bucketCol = 'InvoiceNo'
# Bucketed files are written under the directory below by default, so create
# that directory first.
dbutils.fs.mkdirs('/user/hive/warehouse')
Out[9]: True
# Save the DataFrame as a Parquet table bucketed on `bucketCol` into `bucketNum` buckets.
(
    df.write.format('parquet')
    .bucketBy(bucketNum, bucketCol)
    .saveAsTable('bucketedFiles')
)
%fs ls /user/hive/warehouse/bucketedfiles/
path | name | size |
---|---|---|
dbfs:/user/hive/warehouse/bucketedfiles/_SUCCESS | _SUCCESS | 0 |
dbfs:/user/hive/warehouse/bucketedfiles/_committed_1429916160167243299 | _committed_1429916160167243299 | 8432 |
dbfs:/user/hive/warehouse/bucketedfiles/_started_1429916160167243299 | _started_1429916160167243299 | 0 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-10_00009.c000.snappy.parquet | part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-10_00009.c000.snappy.parquet | 90262 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-1_00000.c000.snappy.parquet | part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-1_00000.c000.snappy.parquet | 87741 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-2_00001.c000.snappy.parquet | part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-2_00001.c000.snappy.parquet | 74086 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-3_00002.c000.snappy.parquet | part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-3_00002.c000.snappy.parquet | 84847 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-4_00003.c000.snappy.parquet | part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-4_00003.c000.snappy.parquet | 92241 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-5_00004.c000.snappy.parquet | part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-5_00004.c000.snappy.parquet | 89909 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-6_00005.c000.snappy.parquet | part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-6_00005.c000.snappy.parquet | 84077 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-7_00006.c000.snappy.parquet | part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-7_00006.c000.snappy.parquet | 87207 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-8_00007.c000.snappy.parquet | part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-8_00007.c000.snappy.parquet | 92634 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-9_00008.c000.snappy.parquet | part-00000-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-27-9_00008.c000.snappy.parquet | 82842 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-10_00009.c000.snappy.parquet | part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-10_00009.c000.snappy.parquet | 85100 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-1_00000.c000.snappy.parquet | part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-1_00000.c000.snappy.parquet | 81427 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-2_00001.c000.snappy.parquet | part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-2_00001.c000.snappy.parquet | 91127 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-3_00002.c000.snappy.parquet | part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-3_00002.c000.snappy.parquet | 86022 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-4_00003.c000.snappy.parquet | part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-4_00003.c000.snappy.parquet | 79613 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-5_00004.c000.snappy.parquet | part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-5_00004.c000.snappy.parquet | 81113 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-6_00005.c000.snappy.parquet | part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-6_00005.c000.snappy.parquet | 90673 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-7_00006.c000.snappy.parquet | part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-7_00006.c000.snappy.parquet | 80229 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-8_00007.c000.snappy.parquet | part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-8_00007.c000.snappy.parquet | 83282 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-9_00008.c000.snappy.parquet | part-00001-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-28-9_00008.c000.snappy.parquet | 93196 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-10_00009.c000.snappy.parquet | part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-10_00009.c000.snappy.parquet | 93511 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-1_00000.c000.snappy.parquet | part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-1_00000.c000.snappy.parquet | 94358 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-2_00001.c000.snappy.parquet | part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-2_00001.c000.snappy.parquet | 84961 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-3_00002.c000.snappy.parquet | part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-3_00002.c000.snappy.parquet | 80911 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-4_00003.c000.snappy.parquet | part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-4_00003.c000.snappy.parquet | 91751 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-5_00004.c000.snappy.parquet | part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-5_00004.c000.snappy.parquet | 78403 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-6_00005.c000.snappy.parquet | part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-6_00005.c000.snappy.parquet | 82548 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-7_00006.c000.snappy.parquet | part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-7_00006.c000.snappy.parquet | 82517 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-8_00007.c000.snappy.parquet | part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-8_00007.c000.snappy.parquet | 82756 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-9_00008.c000.snappy.parquet | part-00002-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-29-9_00008.c000.snappy.parquet | 89442 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-10_00009.c000.snappy.parquet | part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-10_00009.c000.snappy.parquet | 93217 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-1_00000.c000.snappy.parquet | part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-1_00000.c000.snappy.parquet | 83924 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-2_00001.c000.snappy.parquet | part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-2_00001.c000.snappy.parquet | 94888 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-3_00002.c000.snappy.parquet | part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-3_00002.c000.snappy.parquet | 81224 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-4_00003.c000.snappy.parquet | part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-4_00003.c000.snappy.parquet | 81094 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-5_00004.c000.snappy.parquet | part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-5_00004.c000.snappy.parquet | 81804 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-6_00005.c000.snappy.parquet | part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-6_00005.c000.snappy.parquet | 88050 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-7_00006.c000.snappy.parquet | part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-7_00006.c000.snappy.parquet | 86349 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-8_00007.c000.snappy.parquet | part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-8_00007.c000.snappy.parquet | 88488 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-9_00008.c000.snappy.parquet | part-00003-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-30-9_00008.c000.snappy.parquet | 87608 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-10_00009.c000.snappy.parquet | part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-10_00009.c000.snappy.parquet | 87329 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-1_00000.c000.snappy.parquet | part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-1_00000.c000.snappy.parquet | 88483 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-2_00001.c000.snappy.parquet | part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-2_00001.c000.snappy.parquet | 81433 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-3_00002.c000.snappy.parquet | part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-3_00002.c000.snappy.parquet | 80855 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-4_00003.c000.snappy.parquet | part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-4_00003.c000.snappy.parquet | 86472 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-5_00004.c000.snappy.parquet | part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-5_00004.c000.snappy.parquet | 87299 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-6_00005.c000.snappy.parquet | part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-6_00005.c000.snappy.parquet | 85073 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-7_00006.c000.snappy.parquet | part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-7_00006.c000.snappy.parquet | 85654 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-8_00007.c000.snappy.parquet | part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-8_00007.c000.snappy.parquet | 81005 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-9_00008.c000.snappy.parquet | part-00004-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-31-9_00008.c000.snappy.parquet | 90273 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-10_00009.c000.snappy.parquet | part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-10_00009.c000.snappy.parquet | 77355 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-1_00000.c000.snappy.parquet | part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-1_00000.c000.snappy.parquet | 81979 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-2_00001.c000.snappy.parquet | part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-2_00001.c000.snappy.parquet | 80641 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-3_00002.c000.snappy.parquet | part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-3_00002.c000.snappy.parquet | 81541 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-4_00003.c000.snappy.parquet | part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-4_00003.c000.snappy.parquet | 95708 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-5_00004.c000.snappy.parquet | part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-5_00004.c000.snappy.parquet | 91008 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-6_00005.c000.snappy.parquet | part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-6_00005.c000.snappy.parquet | 81946 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-7_00006.c000.snappy.parquet | part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-7_00006.c000.snappy.parquet | 87397 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-8_00007.c000.snappy.parquet | part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-8_00007.c000.snappy.parquet | 84939 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-9_00008.c000.snappy.parquet | part-00005-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-32-9_00008.c000.snappy.parquet | 80757 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-10_00009.c000.snappy.parquet | part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-10_00009.c000.snappy.parquet | 86184 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-1_00000.c000.snappy.parquet | part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-1_00000.c000.snappy.parquet | 89709 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-2_00001.c000.snappy.parquet | part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-2_00001.c000.snappy.parquet | 88274 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-3_00002.c000.snappy.parquet | part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-3_00002.c000.snappy.parquet | 76828 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-4_00003.c000.snappy.parquet | part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-4_00003.c000.snappy.parquet | 63636 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-5_00004.c000.snappy.parquet | part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-5_00004.c000.snappy.parquet | 91554 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-6_00005.c000.snappy.parquet | part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-6_00005.c000.snappy.parquet | 84641 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-7_00006.c000.snappy.parquet | part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-7_00006.c000.snappy.parquet | 79675 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-8_00007.c000.snappy.parquet | part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-8_00007.c000.snappy.parquet | 86432 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-9_00008.c000.snappy.parquet | part-00006-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-33-9_00008.c000.snappy.parquet | 79475 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-10_00009.c000.snappy.parquet | part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-10_00009.c000.snappy.parquet | 39512 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-1_00000.c000.snappy.parquet | part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-1_00000.c000.snappy.parquet | 51921 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-2_00001.c000.snappy.parquet | part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-2_00001.c000.snappy.parquet | 47941 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-3_00002.c000.snappy.parquet | part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-3_00002.c000.snappy.parquet | 33996 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-4_00003.c000.snappy.parquet | part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-4_00003.c000.snappy.parquet | 38650 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-5_00004.c000.snappy.parquet | part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-5_00004.c000.snappy.parquet | 49519 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-6_00005.c000.snappy.parquet | part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-6_00005.c000.snappy.parquet | 33252 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-7_00006.c000.snappy.parquet | part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-7_00006.c000.snappy.parquet | 36333 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-8_00007.c000.snappy.parquet | part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-8_00007.c000.snappy.parquet | 28534 |
dbfs:/user/hive/warehouse/bucketedfiles/part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-9_00008.c000.snappy.parquet | part-00007-tid-1429916160167243299-ae8dd407-c390-4806-8bb9-1eed6d0e2e7f-34-9_00008.c000.snappy.parquet | 42641 |
데이터를 저장할 땐 파일 크기가 중요한 요소가 아니지만 읽을 땐 중요한 요소 중 하나
작은 크기의 파일 문제
큰 크기의 파일 문제
스파크 2.2버전에는 파일 크기를 제어할 수 있는 새로운 방법이 도입됨
# Guarantees that each written file contains at most 5000 rows.
# This expression only configures and returns a DataFrameWriter; nothing is
# written because no save call follows.
df.write.option('maxRecordsPerFile', 5000)
Out[17]: <pyspark.sql.readwriter.DataFrameWriter at 0x7fcf15185eb0>