/**
 * Parses a JSON response and maps every element of its top-level
 * {@code "items"} array to an {@link ArticleDtoForSorting}.
 *
 * <p>Parsing is best-effort: if the response is {@code null}, blank, has no
 * {@code "items"} array, or cannot be parsed, whatever has been collected so far
 * (possibly nothing) is returned instead of propagating the failure.
 *
 * @param response raw JSON text; may be {@code null} or blank
 * @return a mutable list of the articles that were mapped successfully, never {@code null}
 */
private static List<ArticleDtoForSorting> parseArticles(String response) {
    List<ArticleDtoForSorting> articles = new ArrayList<>();
    // Guard first: handing null/blank content to readTree either throws an
    // unchecked exception or yields a null root, depending on the Jackson version.
    if (response == null || response.trim().isEmpty()) {
        return articles;
    }
    ObjectMapper objectMapper = new ObjectMapper();
    try {
        JsonNode rootNode = objectMapper.readTree(response);
        if (rootNode == null) {
            return articles;
        }
        // path() never returns null; a missing key yields a node whose isArray() is false.
        JsonNode itemsNode = rootNode.path("items");
        if (itemsNode.isArray()) {
            for (JsonNode item : itemsNode) {
                articles.add(objectMapper.treeToValue(item, ArticleDtoForSorting.class));
            }
        }
    } catch (IOException e) {
        // Deliberately best-effort: report the failure and return what was parsed so far.
        // NOTE(review): route this through the project's logger once one is available here.
        e.printStackTrace();
    }
    return articles;
}
items로 들어오는 기사들을 매핑해주는 방식
1)JSON 파싱 시작->
objectMapper.readTree(response)로 JSON 문자열을 파싱하여 트리 구조로 변환한다.
2) "items" 노드 찾기->
rootNode.path("items")로 JSON에서 "items" 키에 해당하는 노드를 찾는다.
3)각 항목을 ArticleDtoForSorting 객체로 변환->
objectMapper.treeToValue()를 사용해 JSON 노드를 ArticleDtoForSorting 객체로 변환한다.
/**
 * Builds a {@link ReportInfo} for every row of the {@code table.tbList} body found in
 * the given HTML, skipping reports whose name contains an exclusion keyword.
 *
 * <p>The loop is throttled on purpose: it pauses for {@code ONE_MINUTE} on every 100th
 * row and for {@code ONE_SECOND} after each collected row, because working through the
 * rows too quickly gets the client IP blocked.
 *
 * @param html HTML of a search-result page containing the {@code table.tbList} table
 * @return the collected report entries, in row order; empty when no row qualifies
 * @throws InterruptedException if the thread is interrupted during a throttling pause
 */
public List<ReportInfo> extractReportInfoFromHtml(String html) throws InterruptedException {
    List<ReportInfo> reportInfos = new ArrayList<>();
    Document doc = Jsoup.parse(html);
    Elements rows = doc.select("table.tbList tbody tr");
    // Throttling counter: incremented once per row; every 100th row triggers a
    // one-minute pause so the remote site does not block this IP.
    int count = 0;
    for (Element row : rows) {
        count += 1;
        if (count % 100 == 0) {
            Thread.sleep(ONE_MINUTE);
        }
        Elements cells = row.select("td");
        // Cells up to index 4 (submission date) are read below; rows with fewer cells
        // (e.g. a placeholder row) are skipped rather than failing with an index error.
        if (cells.size() < 5) {
            continue;
        }
        String reportName = cells.get(2).text();
        // Check every configured keyword instead of only the first two entries.
        boolean excluded = false;
        for (CharSequence keyword : REPORT_EXCLUSION_KEYWORDS) {
            if (reportName.contains(keyword)) {
                excluded = true;
                break;
            }
        }
        if (excluded) {
            continue;
        }
        // Only build the entry once the row is known to be wanted.
        ReportInfo reportInfo = new ReportInfo();
        reportInfo.setCompanyName(extractCompanyNameFromRow(row));
        populateReportInfoFromRow(row, reportInfo, reportInfos);
        Thread.sleep(ONE_SECOND);
    }
    return reportInfos;
}
/**
 * Extracts the company name from the second cell of a result-table row.
 *
 * <p>The cell holds a short company-type tag followed by a link whose text is the
 * company name. The link text is preferred so that names containing spaces are
 * returned in full. If the cell has no link, everything after the first
 * space-separated token (the tag) is returned, or the whole cell text when there is
 * no space at all.
 *
 * @param row a {@code tr} element of the result table
 * @return the company name with surrounding whitespace removed
 * @throws IndexOutOfBoundsException if the row has fewer than two {@code td} cells
 */
public static String extractCompanyNameFromRow(Element row) {
    Element companyCell = row.select("td").get(1);
    Elements links = companyCell.select("a");
    if (!links.isEmpty()) {
        String linkText = links.get(0).text().trim();
        if (!linkText.isEmpty()) {
            return linkText;
        }
    }
    // Fallback: drop the leading tag token, keep the rest of the name intact.
    String cellText = companyCell.text().trim();
    int firstSpace = cellText.indexOf(' ');
    return firstSpace < 0 ? cellText : cellText.substring(firstSpace + 1).trim();
}
/**
 * Fills the report name, link, submission date and document number of
 * {@code reportInfo} from the cells of {@code row}, then appends it to
 * {@code reportInfos}.
 *
 * @param row         table row to read; its third cell supplies the report name and
 *                    link, its fifth cell the submission date
 * @param reportInfo  entry to populate
 * @param reportInfos list that receives the populated entry
 */
private void populateReportInfoFromRow(Element row, ReportInfo reportInfo, List<ReportInfo> reportInfos) {
    Elements cells = row.select("td");
    Element reportCell = cells.get(2);

    reportInfo.setReportName(reportCell.text());

    String link = reportCell.select("a").attr("href");
    reportInfo.setReportLink(link);

    Element dateCell = cells.get(4);
    reportInfo.setSubmissionDate(dateCell.text());

    // The document number is derived from the link that was just stored.
    reportInfo.setDocumentNumber(extractDocumentNumberFromUrl(reportInfo.getReportLink()));

    reportInfos.add(reportInfo);
}
<div class="tbListInner">
<table class="tbList" summary="공시서류검색에 대한 번호, 공시대상회사, 보고서명, 제출인, 접수일자, 비고 등을 알리는 표입니다.">
<caption>공시서류검색 목록</caption>
<colgroup id="colgropup">
<col style="width:6%">
<col style="width:20%">
<col style="width:auto">
<col style="width:13%">
<col style="width:11%">
<col style="width:8%">
</colgroup>
<thead>
<tr id="tr">
<th scope="row"><label for="inpSample00">번호</label></th>
<th scope="row"><label for="inpSample00">공시대상회사</label></th>
<th scope="row"><label for="inpSample00">보고서명</label></th>
<th scope="row"><label for="inpSample00">제출인</label></th>
<th scope="row"><label for="inpSample00">접수일자</label></th>
<th scope="row"><label for="inpSample00">비고</label></th>
</tr>
</thead>
<tbody id="tbody">
<tr>
<td>
501
</td>
<td class="tL">
<span class="innerWrap">
<span class="tagCom_etc" title="기타법인" style="cursor:default">기</span>
<a href="javascript:openCorpInfoNew('00969642', 'winCorpInfo', '/dsae001/selectPopup.ax');"
title="에스이그린에너지 기업개황 새창">
에스이그린에너지
</a>
</span>
</td>
<td class="tL">
<a href="/dsaf001/main.do?rcpNo=20240513000307" id="r_20240513000307"
onclick="openReportViewer('20240513000307',''); return false;" title="분기보고서 공시뷰어 새창">
분기보고서
(2024.03)
</a>
이렇게 들어오는 HTML은 Elements rows = doc.select("#reportBody tr"); 와 같이 선택해도 되긴 한다.