Java에서 이미지를 크롤링해야 할 일이 있어 라이브러리를 찾아보니 OpenGraph라는 것이 있었다. 이 라이브러리를 사용해 본 후기를 기록하고자 한다.
https://github.com/johndeverall/opengraph-java
// HtmlCleaner 2.6 — NOTE(review): presumably the HTML parser the OpenGraph class relies on; confirm against the library source.
implementation 'net.sourceforge.htmlcleaner:htmlcleaner:2.6'
// Spring Cloud OpenFeign starter; supplies the @FeignClient support used by the crawling client. Exposed with `api` so dependent modules see it too.
api 'org.springframework.cloud:spring-cloud-starter-openfeign:3.1.1'
OpenGraph 클래스의 생성자는 기본적으로 URLConnection으로 페이지를 가져오는데, 이 부분을 FeignClient로 가져오도록 변경하여 사용하였다.
/**
 * Creates an OpenGraph instance by fetching the page at the given address.
 *
 * <p>The visible part of the body opens a {@link URLConnection} to the page and wraps its
 * input stream in a {@link BufferedReader} that decodes with the charset resolved by
 * {@code getConnectionCharset}. The remainder of the body is elided in this listing.
 *
 * @param url address of the page to fetch
 * @param ignoreSpecErrors not used in the visible portion of the body;
 *        NOTE(review): presumably relaxes Open Graph spec validation - confirm
 * @throws java.io.IOException if the URL is malformed or the connection cannot be opened or read
 * @throws Exception declared by the original signature; the elided part may raise it
 */
public OpenGraph(String url, boolean ignoreSpecErrors) throws java.io.IOException, Exception {
// Delegate to the no-argument constructor first.
this();
// Flag this instance as populated from an external page.
isImported = true;
URL pageURL = new URL(url);
URLConnection siteConnection = pageURL.openConnection();
// Decode the body with the charset derived from the connection rather than a fixed default.
Charset charset = getConnectionCharset(siteConnection);
BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset));
...
}
/**
 * Creates an OpenGraph instance by fetching the page at the given address over a
 * {@link URLConnection}. The remainder of the body is elided in this listing.
 *
 * <p>NOTE(review): this listing is byte-for-byte the same as the constructor shown just before
 * it, although the surrounding text says the constructor was changed to use FeignClient and
 * {@code crawlingImg} calls {@code new OpenGraph(true, response)}. The modified
 * {@code (boolean, Response)} constructor was presumably meant to appear here - confirm and
 * replace this listing.
 *
 * @param url address of the page to fetch
 * @param ignoreSpecErrors not used in the visible portion of the body;
 *        NOTE(review): presumably relaxes Open Graph spec validation - confirm
 * @throws java.io.IOException if the URL is malformed or the connection cannot be opened or read
 * @throws Exception declared by the original signature; the elided part may raise it
 */
public OpenGraph(String url, boolean ignoreSpecErrors) throws java.io.IOException, Exception {
// Delegate to the no-argument constructor first.
this();
// Flag this instance as populated from an external page.
isImported = true;
URL pageURL = new URL(url);
URLConnection siteConnection = pageURL.openConnection();
// Decode the body with the charset derived from the connection rather than a fixed default.
Charset charset = getConnectionCharset(siteConnection);
BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset));
...
}
/**
 * Feign client used to download the raw HTML of a page so its Open Graph tags can be read.
 *
 * <p>Request headers are supplied by {@link HeaderConfig}. The {@code url} attribute is only
 * {@code "/"}; NOTE(review): the {@link URI} argument is presumably what selects the actual
 * target at call time (Feign's dynamic-URL feature) - confirm.
 */
@FeignClient(
        name = "ImageCrawlingApiInterface",
        url = "/",
        configuration = HeaderConfig.class)
public interface ImageCrawlingApiInterface {

    /**
     * Issues a GET request for the given page.
     *
     * @param productUrl address of the page to fetch
     * @return the unprocessed Feign response for that page
     */
    @GetMapping(produces = "text/html; charset=euc-kr")
    Response getImageApi(URI productUrl);
}
/**
 * Feign configuration for the crawling client: contributes a request interceptor that adds
 * browser-style headers (a desktop Chrome User-Agent, Korean-first Accept-Language) to every
 * request made through a client that references this class.
 *
 * <p>NOTE(review): the class carries no {@code @Configuration} annotation. Spring Cloud OpenFeign
 * allows this for classes referenced through {@code @FeignClient(configuration = ...)}, and it
 * keeps the interceptor from being applied to other clients - confirm this is intentional.
 */
public class HeaderConfig {

    /**
     * Builds the interceptor that stamps the browser-style headers onto each outgoing request.
     *
     * @return an interceptor adding Accept, Accept-Encoding, Accept-Language and User-Agent
     */
    @Bean
    public RequestInterceptor requestInterceptor() {
        return requestTemplate -> {
            requestTemplate.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
            // Advertise only encodings the client side can decode. "br" (Brotli) is left out
            // on purpose: the JDK ships no Brotli decoder, so a server honouring it would hand
            // the HTML parser a still-compressed body.
            requestTemplate.header("Accept-Encoding", "gzip, deflate");
            requestTemplate.header("Accept-Language", "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7");
            requestTemplate.header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36");
        };
    }
}
/**
 * Downloads the page at {@code url} through the Feign client and returns its Open Graph image.
 *
 * @param url address of the page to crawl; every {@code "?&"} sequence in it is rewritten to
 *        {@code "?"} before the request is made
 * @return the content of the page's {@code image} Open Graph property, or {@code null} when
 *         the OpenGraph object could not be built from the response
 */
public String crawlingImg(String url){
    // The client interface declares this method as getImageApi(URI); call it by that name.
    Response response = imageCrawlingApiInterface.getImageApi(URI.create(url.replace("?&","?")));
    OpenGraph graph;
    // A Feign method that returns the raw Response leaves closing it to the caller, so release
    // the connection once the page has been parsed.
    // NOTE(review): assumes the OpenGraph constructor consumes the body eagerly - confirm.
    try (Response page = response) {
        // Build the OpenGraph object that holds the page's og tag data.
        graph = new OpenGraph(true, page);
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
    return graph.getContent("image");
}
참고 블로그