unstructured.io
설치
# Fetch the latest published unstructured image.
docker pull downloads.unstructured.io/unstructured-io/unstructured:latest
# Start a detached container named "unstructured" on the host network.
# NOTE: 파일경로:파일경로 ("file path") appears to be a placeholder -- replace it
# with the real host:container mount before running.
docker run \
  --detach \
  --tty \
  --network host \
  --volume 파일경로:파일경로 \
  --name unstructured \
  downloads.unstructured.io/unstructured-io/unstructured:latest
# Open an interactive bash session inside the running container.
docker exec --interactive --tty unstructured bash
실행
# Partition a PDF with the "hi_res" strategy (table-structure inference on,
# "yolox" model) and print the HTML stored in each element's metadata.
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="test_data.pdf",
    strategy="hi_res",
    infer_table_structure=True,
    model_name="yolox",
)

# In the sample output recorded below, text_as_html prints as None for most
# elements; only the table element yields a <table> string.
for e in elements:
    print(e.metadata.text_as_html)
    print('--------------------------')
Before

After
--------------------------
None
--------------------------
None
--------------------------
None
--------------------------
None
--------------------------
None
--------------------------
None
--------------------------
None
--------------------------
None
--------------------------
<table><thead><tr><th></th><th>RN ol < JW/ T</th><th>i ol | S</th></tr></thead><tbody><tr><td>b3 o rio 4</td><td>x il ur N W <2 S =W of o e 2= B G £ Mk X o< N ox e} e}</td><td>of o @y g S KB</td></tr><tr><td>™ of K- K</td><td>o o < (2% &y T o MM_. oo RT® F23 - o) = bom X %N Tt N ot ot TR e} e}</td><td>~ B o = | K Moo RN | B IR T</td></tr><tr><td></td><td>| BxBa R = AN o s = N MO N _% = 9 | o e A\ go K Nom o ®Roxom o B = R e} e} e}</td><td></td></tr><tr><td>| oo X0 o</td><td>) iy F A B ) o W K w0 w_mrh -~ - 23fz.zf ®ONNED Ty T T A o= N X o dw RN FT - ”JW 38 oo TR OPM e} o</td><td>Plo T 2 Y <<</td></tr><tr><td>% - ol</td><td>T z ° %0 i %o o)) I o oy B K 1k ¥ T < @ o 0</td><td>™o o > 83 %%QUH oo M ™ AUy TR E LB Il N</td></tr><tr><td></td><td>G o R F o oK N © o B F T |F3 wr o y % o < e ~ o ) o</td><td></td></tr></tbody></table>