- Maven을 이용해 jsoup를 테스트한 결과를 자바 애플리케이션으로 구현함.
- 리디북스 상품페이지의 html을 크롤링하여 프로젝트의 책 데이터베이스를 만듦.
- 팀원이 코모란 라이브러리를 이용해 검색키워드를 뽑아내어 데이터베이스에 추가함.
package bit.hibooks.java.app;
/**
 * Command-line entry point that walks the listing pages one by one and asks a
 * {@link BookDataManager} to store the books found on each page.
 */
public class BookDataUser {
    /** Listing URL prefix; the page number is appended to it for every request. */
    private static final String LIST_URL = "https://ridibooks.com/category/books/110?&page=";
    /** First listing page to process. */
    private static final int FIRST_PAGE = 1;
    /** Last listing page to process (inclusive). */
    private static final int LAST_PAGE = 110;
    /** Value handed to {@code insertItemInfo} as its {@code cate} argument. */
    private static final long CATEGORY = 500;

    /**
     * Processes pages {@code FIRST_PAGE..LAST_PAGE} and then releases the manager.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        BookDataManager bdm = new BookDataManagerImpl();
        try {
            for (int pageNum = FIRST_PAGE; pageNum <= LAST_PAGE; pageNum++) {
                bdm.insertItemInfo(LIST_URL, pageNum, CATEGORY);
            }
        } finally {
            // Always release the manager, even when a page aborts the run with an exception.
            bdm.closeCon();
        }
    }
}
package bit.hibooks.java.app;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import bit.hibooks.domain.Book;
/**
 * {@link BookDataManager} implementation that reads a listing page with jsoup,
 * follows every book link on it, extracts the book's details from the product
 * page and inserts them into the {@code BOOK} table over JDBC.
 *
 * <p>A single connection is opened in the constructor and must be released
 * with {@link #closeCon()}. Pages or books that cannot be fetched or parsed are
 * reported on standard output and skipped, so one bad page does not abort the
 * whole run.
 */
public class BookDataManagerImpl implements BookDataManager {
    /** Connection opened by the constructor; stays {@code null} if connecting failed. */
    Connection con;

    /**
     * Loads the Oracle driver and opens the connection. Failures are only
     * reported; {@link #con} is then left {@code null}.
     */
    BookDataManagerImpl() {
        // NOTE(review): URL and credentials are hard-coded; consider external configuration.
        String url = "jdbc:oracle:thin:@localhost:1521:JAVA";
        try {
            Class.forName("oracle.jdbc.driver.OracleDriver");
            con = DriverManager.getConnection(url, "board", "java");
        } catch (ClassNotFoundException cnfe) {
            System.out.println("#오라클 드라이버 인식 실패");
        } catch (SQLException se) {
            System.out.println("#testConnectionJdbc() exception: " + se);
        }
    }

    /** Number of rows inserted successfully. */
    int numSuccess = 0;
    /** Number of inserts that failed. */
    int numFail = 0;

    /**
     * Inserts one book row and updates the success/failure counters.
     *
     * @param book the book to store
     */
    private void insert(Book book) {
        if (con == null) {
            // The constructor could not connect; report it as an insert failure
            // instead of letting prepareStatement throw a NullPointerException.
            System.out.println("입력실패 상품아이디 : " + book.getB_itemId() + ", 원인 : DB 연결 없음");
            numFail += 1;
            return;
        }
        String sql = "insert into BOOK values(BOOK_SEQ.nextval, ?,?,?,?,?,?,?,?,?)";
        PreparedStatement pstmt = null;
        try {
            pstmt = con.prepareStatement(sql);
            pstmt.setString(1, book.getB_itemId());
            pstmt.setString(2, book.getB_title());
            pstmt.setString(3, book.getB_img());
            pstmt.setDouble(4, book.getB_rate());
            pstmt.setString(5, book.getB_writer());
            pstmt.setString(6, book.getB_publisher());
            pstmt.setLong(7, book.getB_price());
            pstmt.setString(8, book.getB_desc());
            pstmt.setLong(9, book.getB_cate());
            pstmt.executeUpdate();
            numSuccess += 1;
        } catch (SQLException se) {
            System.out.println("입력실패 상품아이디 : " + book.getB_itemId() + ", 원인 : " + se);
            numFail += 1;
        } finally {
            if (pstmt != null) {
                try {
                    pstmt.close();
                } catch (SQLException e) {
                    // The row outcome is already recorded; just make the failure visible.
                    System.out.println("#PreparedStatement close exception: " + e);
                }
            }
        }
    }

    /** Prints the insert counters and closes the connection if one was opened. */
    @Override
    public void closeCon() {
        System.out.println("입력성공횟수 : " + numSuccess + ", 입력실패횟수 : " + numFail);
        if (con == null) {
            return;
        }
        try {
            con.close();
        } catch (SQLException e) {
            System.out.println("#Connection close exception: " + e);
        }
    }

    /**
     * Fetches the listing page {@code url + pageNum} and inserts every book
     * linked from it.
     *
     * @param url     listing URL prefix to which the page number is appended
     * @param pageNum page number to fetch
     * @param cate    category value stored with every book of this page
     */
    @Override
    public void insertItemInfo(String url, int pageNum, long cate) {
        String listUrl = url + pageNum;
        Document doc;
        try {
            doc = Jsoup.connect(listUrl).get();
        } catch (IOException e) {
            // Without the listing there is nothing to iterate; skip this page only.
            System.out.println("목록 페이지 요청 실패 : " + listUrl + ", 원인 : " + e);
            return;
        }

        Elements items = doc.select("div.book_thumbnail > a");
        for (Element item : items) {
            String href = item.attr("href");
            String[] parts = href.split("/");
            // The item id is read from the third "/"-separated segment of the href.
            if (parts.length < 3) {
                System.out.println("파싱오류 : " + href);
                continue;
            }
            String itemId = parts[2].trim();

            Book book = getItemInfo(itemId);
            if (book == null) {
                System.out.println("파싱오류 : " + itemId);
                continue;
            }
            book.setB_cate(cate);
            insert(book);
        }
    }

    /**
     * Downloads the product page of {@code itemId} and extracts its details.
     *
     * @param itemId product id used to build the product page URL
     * @return the populated book, or {@code null} when the page could not be
     *         fetched or a required element (writer, publisher, price,
     *         description) is missing
     */
    private Book getItemInfo(String itemId) {
        String url = "https://ridibooks.com/books/" + itemId;
        Document doc;
        try {
            doc = Jsoup.connect(url).get();
        } catch (IOException e) {
            System.out.println("상품 페이지 요청 실패 : " + url + ", 원인 : " + e);
            return null;
        }

        // selectFirst yields null when nothing matches, so collect the required
        // texts first and give up on the book if any of them is absent.
        String writer = firstText(doc, "p.metadata.metadata_writer");
        String publisher = firstText(doc, "a.publisher_detail_link");
        String price = firstText(doc, "tr.selling_price_row > td.book_price > span");
        String desc = firstText(doc, "div #introduce_book > p");
        if (writer == null || publisher == null || price == null || desc == null) {
            return null;
        }

        Book book = new Book();
        for (Element meta : doc.select("meta[property]")) {
            String property = meta.attr("property");
            String content = meta.attr("content");
            if (property.equals("og:title")) {
                book.setB_title(content);
            } else if (property.equals("og:image")) {
                book.setB_img(content);
            } else if (property.equals("books:rating:value")) {
                try {
                    book.setB_rate(Double.parseDouble(content.trim()));
                } catch (NumberFormatException ne) {
                    // Keep the book; only the rating is left at its default.
                    System.out.println("평점 파싱 실패 : " + itemId + ", 값 : " + content);
                }
            }
        }

        book.setB_itemId(itemId.trim());

        // The text before the first "저" is stored as the writer, the rest as translator.
        int writerDiv = writer.indexOf("저");
        if (writerDiv < 0) {
            // No marker present: treat the whole text as the writer.
            book.setB_writer(writer);
            book.setB_translator("");
        } else {
            book.setB_writer(writer.substring(0, writerDiv));
            book.setB_translator(writer.substring(writerDiv + 1));
        }

        book.setB_publisher(publisher);

        try {
            book.setB_price(Long.parseLong(price.trim().replace(",", "")));
        } catch (NumberFormatException ne) {
            // Keep the book; only the price is left at its default.
            System.out.println("가격 파싱 실패 : " + itemId + ", 값 : " + price);
        }

        book.setB_desc(desc);
        return book;
    }

    /**
     * Returns the text of the first element matching {@code cssQuery}.
     *
     * @return the element text, or {@code null} when no element matches
     */
    private static String firstText(Document doc, String cssQuery) {
        Element found = doc.selectFirst(cssQuery);
        return found == null ? null : found.text();
    }
}
/**
 * Downloads the product page of {@code itemId} and extracts its details,
 * including a secondary category and a comma-separated keyword string built
 * from the page's keywords meta tag plus the distinct nouns KOMORAN finds in
 * the description.
 *
 * @param itemId product id used to build the product page URL
 * @return the populated book
 * @throws NullPointerException when the page could not be fetched or a required
 *         element (description, writer, publisher, price) is missing; callers
 *         treat this as "skip this book"
 */
private Book getItemInfo(String itemId) throws NullPointerException {
    String url = "https://ridibooks.com/books/" + itemId;
    Document doc;
    try {
        doc = Jsoup.connect(url).get();
    } catch (IOException e) {
        e.printStackTrace();
        // Keep the documented failure signal explicit instead of relying on a
        // later dereference of a null document.
        throw new NullPointerException("failed to fetch " + url);
    }

    // Required elements first: a missing one raises NullPointerException here,
    // before the comparatively costly morphological analysis below is started.
    String b_desc = doc.selectFirst("div #introduce_book > p").text();
    String writer = doc.selectFirst("p.metadata.metadata_writer").text();
    String publisher = doc.selectFirst("a.publisher_detail_link").text();
    String price = doc.selectFirst("tr.selling_price_row > td.book_price > span").text().trim();

    Book book = new Book();
    for (Element meta : doc.select("meta[property]")) {
        String property = meta.attr("property");
        String content = meta.attr("content");
        if (property.equals("og:title")) {
            book.setB_title(content);
        }
        if (property.equals("og:image")) {
            book.setB_img(content);
        }
        if (property.equals("books:rating:value")) {
            try {
                book.setB_rate(Double.parseDouble(content.trim()));
            } catch (NumberFormatException ne) {
                // Keep the book; only the rating is left at its default.
                ne.printStackTrace();
            }
        }
        if (property.contentEquals("keywords")) {
            String cate2 = categoryFromKeywords(content);
            if (cate2 != null) {
                book.setB_cate2(cate2);
            }
        }
    }

    String keywords = null;
    for (Element meta : doc.select("meta[name]")) {
        if (meta.attr("name").equals("keywords")) {
            keywords = meta.attr("content");
            String cate2 = categoryFromKeywords(keywords);
            if (cate2 != null) {
                book.setB_cate2(cate2);
            }
        }
    }

    // NOTE(review): a new Komoran instance is built on every call; if model
    // loading is expensive, consider sharing one instance — confirm.
    Komoran komoran = new Komoran(DEFAULT_MODEL.STABLE);
    KomoranResult analyzeResultList = komoran.analyze(b_desc);
    // Iterate the result directly instead of casting it to ArrayList.
    ArrayList<String> nouns = new ArrayList<String>();
    for (String noun : analyzeResultList.getNouns()) {
        if (!nouns.contains(noun)) {
            nouns.add(noun);
        }
    }

    StringBuilder sb = new StringBuilder();
    if (keywords != null) {
        // Without this guard a page lacking the keywords meta tag would store
        // the literal text "null" as its first keyword.
        sb.append(keywords);
        sb.append(",");
    }
    for (String noun : nouns) {
        sb.append(noun);
        sb.append(",");
    }
    book.setB_keyword(sb.toString());

    book.setB_itemId(itemId.trim());

    // Everything up to and including the last "저" is stored as the writer,
    // the remainder as the translator.
    int writerDiv = writer.lastIndexOf("저");
    if (writerDiv < 0) {
        // No marker present: treat the whole text as the writer.
        book.setB_writer(writer);
        book.setB_translator("");
    } else {
        book.setB_writer(writer.substring(0, writerDiv + 1));
        book.setB_translator(writer.substring(writerDiv + 1));
    }

    book.setB_publisher(publisher);

    try {
        book.setB_price(Long.parseLong(price.replace(",", "")));
    } catch (NumberFormatException ne) {
        // Keep the book; only the price is left at its default.
        ne.printStackTrace();
    }

    book.setB_desc(b_desc);
    return book;
}

/**
 * Picks the fourth comma-separated entry of a keywords string.
 *
 * @return that entry, or {@code null} when there are fewer than four entries
 */
private static String categoryFromKeywords(String keywords) {
    String[] entries = keywords.split(",");
    return entries.length > 3 ? entries[3] : null;
}
안녕하세요. 위의 포스팅을 참조해서 상세 페이지에서 메타 뽑아오는 것은 잘 참조했습니다. 혹시 검색 화면에서 리스트 뽑아오는 것도 작업하신 것이 있으면 팁 좀 얻을 수 있을까 해서 문의드려요.