사람인 척 클릭도 하고 로그인도 하는 등 위장하는 크롤러
너무 빠르게 페이지에 접근하는 것을 막기 위해 중간중간 잠시 쉬는 것도 설정
크로미움 브라우저도 설치
npm i puppeteer
//index.js
const parse = require('csv-parse/lib/sync');
const fs = require('fs');
const puppeteer = require('puppeteer');
const csv = fs.readFileSync('csv/data.csv');
const records = parse(csv.toString('utf-8'));
const crawler = async () => {
const browser = await puppeteer.launch({ headless: false }); //브라우저 띄우기
const page = await browser.newPage(); //페이지 띄우기
const page2 = await browser.newPage();
const page3 = await browser.newPage();
await page.goto('https://zerocho.com'); //페이지 접속
await page.goto('https://naver.com');
await page.goto('https://google.com');
await page.waitFor(3000); //3초 기다리기
await page2.waitFor(1000);
await page3.waitFor(2000);
await page.close(); //페이지 닫기
await page2.close();
await page3.close();
await browser.close(); //브라우저 닫기
}
crawler();
const browser = await puppeteer.launch({ headless: true });
const browser = await puppeteer.launch({ headless: process.env.NODE_ENV ==='production' }); //개발시에만 화면이 보이도록
const parse = require('csv-parse/lib/sync');
const fs = require('fs');
const puppeteer = require('puppeteer');
const csv = fs.readFileSync('csv/data.csv');
const records = parse(csv.toString('utf-8'));
const crawler = async () => {
const browser = await puppeteer.launch({ headless: false });
const [page, page2, page3] = await Promise.all([ //한번에 페이지 3개 띄우기
browser.newPage(),
browser.newPage(),
browser.newPage(),
])
await Promise.all([ //한번에 주소 접근
page.goto('https://zerocho.com'),
page2.goto('https://naver.com'),
page3.goto('https://google.com'),
]);
await Promise.all([ //한번에 기다리기
await page.waitFor(3000),
await page2.waitFor(1000),
await page3.waitFor(2000),
])
await page.close();
await page2.close();
await page3.close();
await browser.close();
}
Promise로 페이지를 동시에 조작
//data.csv
타이타닉,https://movie.naver.com/movie/bi/mi/basic.nhn?code=18847
아바타,https://movie.naver.com/movie/bi/mi/basic.nhn?code=62266
매트릭스,https://movie.naver.com/movie/bi/mi/basic.nhn?code=24452
반지의 제왕,https://movie.naver.com/movie/bi/mi/basic.nhn?code=31794
어벤져스,https://movie.naver.com/movie/bi/mi/basic.nhn?code=72363
겨울왕국,https://movie.naver.com/movie/bi/mi/basic.nhn?code=100931
트랜스포머,https://movie.naver.com/movie/bi/mi/basic.nhn?code=61521
해리 포터,https://movie.naver.com/movie/bi/mi/basic.nhn?code=30688
다크나이트,https://movie.naver.com/movie/bi/mi/basic.nhn?code=62586
캐리비안의 해적,https://movie.naver.com/movie/bi/mi/basic.nhn?code=37148
//index.js
const parse = require('csv-parse/lib/sync');
const fs = require('fs');
const puppeteer = require('puppeteer');
const csv = fs.readFileSync('csv/data.csv');
const records = parse(csv.toString('utf-8'));
const crawler = async () => {
try { //async는 내부를 try와 catch로 감싸야한다
const browser = await puppeteer.launch({ headless: false });
await Promise.all(records.map(async (r, i) => {
try { //여기도 try에 내용 적기
const page = await browser.newPage();
await page.goto(r[1]); //r[1]은 data.csv에 있는 링크들
const scoreEl = await page.$('.score.score_left .star_score');
if (scoreEl) { //지정한 태그(평점)을 제대로 찾았는지 확인
//scoreEl은 태그를 감싸고있는 핸들러
//핸들러를 evaluate에 넣어줘야한다
const text = await page.evaluate(tag => tag.textContent, scoreEl); //텍스트 컨텐트를 text에 저장
console.log(r[0], '평점', text.trim());
}
await page.waitFor(3000); //봇인 것을 안 들키기위해
await page.close();
}
catch (e) {
console.error(e);
}
}));
await browser.close();
} catch (e) { //에러
console.error(e);
}
}
파싱의 반대 스트링이파이
npm csv-stringify
//index.js
const parse = require('csv-parse/lib/sync');
const stringify = require('csv-stringify/lib/sync'); //stringify
const fs = require('fs');
const puppeteer = require('puppeteer');
const csv = fs.readFileSync('csv/data.csv');
const records = parse(csv.toString('utf-8'));
const crawler = async () => {
try {
const result = []; //결과 배열 생성
const browser = await puppeteer.launch({ headless: false });
await Promise.all(records.map(async (r, i) => {
try {
const page = await browser.newPage();
await page.goto(r[1]);
const scoreEl = await page.$('.score.score_left .star_score');
if (scoreEl) {
const text = await page.evaluate(tag => tag.textContent, scoreEl);
console.log(r[0], '평점', text.trim());
result.push([r[0], r[1], text.trim()]); //2차원 배열로 push
}
await page.close();
const str = stringify(result); //2차 배열을 문자열로 만들어주기
fs.writeFileSync('csv/result.csv', str); // result.csv에 적어주기
}
catch (e) {
console.error(e);
}
}));
await browser.close(); //브라우저 닫기
} catch (e) {
console.error(e);
}
}
result[i] = [r[0], r[1], text.trim()]; //배열에 입력할 때 순서대로
await Promise.all(records.map(async (r, i) => {})
에서 인덱스를 가져오기 때문에 가능//index.js
await Promise.all(records.map(async (r, i) => {
try {
const page = await browser.newPage();
await page.goto(r[1]);
const scoreEl = await page.$('.score.score_left .star_score');
const scoreEl2 = await page.$('.score.score_left .star_score');
const scoreEl3 = await page.$('.score.score_left .star_score');
if (scoreEl) {
const text = await page.evaluate(tag => tag.textContent, scoreEl);
const text = await page.evaluate(tag => tag.textContent, scoreEl2);
const text = await page.evaluate(tag => tag.textContent, scoreEl3);
}
아래와 같이 깔끔하게 변경 가능
//index.js
await Promise.all(records.map(async (r, i) => {
try {
const page = await browser.newPage();
await page.goto(r[1]);
//const text로 evaluate한 데이타를 여기서 받는다
const text = await page.evaluate(() => { //evaluate먼저
const score = document.querySelector('.score.score_left .star_score'); //바로 dom에 접근 가능
if (score) { //score있으면
return score.textContent; //리턴
});
//index.js
const result = await page.evaluate(() => {
const score = document.querySelector('.score.score_left .star_score');
const score2 = document.querySelector('.score.score_left .star_score');
if (score) {
return {
score: score.textContent,
score2: score2.textContent,
};
}
});
result.score, result.score2
//index.js
const parse = require('csv-parse/lib/sync');
const stringify = require('csv-stringify/lib/sync'); //stringify
const fs = require('fs');
const puppeteer = require('puppeteer');
const csv = fs.readFileSync('csv/data.csv');
const records = parse(csv.toString('utf-8'));
const crawler = async () => {
try {
const result = [];
const browser = await puppeteer.launch({ headless: false });
await Promise.all(records.map(async (r, i) => {
try {
const page = await browser.newPage();
await page.goto(r[1]);
//수정된 부분
//const 태그핸들러 = await page.$(선택자);
const text = await page.evaluate(() => {
const score = document.querySelector('.score.score_left .star_score');
if (score) {
return score.textContent;
}
});
if (text) {
console.log(r[0], '평점', text.trim());
result[i] = [r[0], r[1], text.trim()];
}
await page.waitFor(3000);
await page.close();
const str = stringify(result); //2차 배열을 문자열로 만들어주기
fs.writeFileSync('csv/result.csv', str); // result.csv에 적어주기
} catch (e) {
console.error(e);
}
}));
await browser.close();
} catch (e) {
console.error(e);
}
}
crawler();
navigator.userAgent
console에 치면 브라우저 정보가 나온다
//index.js
const parse = require('csv-parse/lib/sync');
const stringify = require('csv-stringify/lib/sync'); //stringify
const fs = require('fs');
const puppeteer = require('puppeteer');
const csv = fs.readFileSync('csv/data.csv');
const records = parse(csv.toString('utf-8'));
const crawler = async () => {
try {
const result = [];
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage(); //page는 하나만 사용
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.0 Safari/537.36'); //navigator.userAgent로 나온 것으로 설정
//Promise.all대신 for of 문 사용
//내부의 try catch제거
for (const [i, r] of records.entries()) {
await page.goto(r[1]);
console.log(await page.evaluate('navigator.userAgent')); //userAgent가 잘 적용되었는지 확인
const text = await page.evaluate(() => {
const score = document.querySelector('.score.score_left .star_score');
if (score) {
return score.textContent;
}
});
if (text) {
console.log(r[0], '평점', text.trim());
result[i] = [r[0], r[1], text.trim()];
}
await page.waitFor(3000); //페이지당 3초씩 기다리기
}
await page.close();
const str = stringify(result);
fs.writeFileSync('csv/result.csv', str);
await browser.close();
} catch (e) {
console.error(e);
}
}
crawler();
크롤링할 페이지가 늘어날 수록 waitFor시간때문에 크롤링 시간⬆︎
컬럼A
제목
타이타닉
아바타
매트릭스
반지의 제왕
어벤져스
겨울왕국
트랜스포머
해리 포터
다크나이트
캐리비안의 해적
컬럼B
링크
https://movie.naver.com/movie/bi/mi/basic.nhn?code=18847
https://movie.naver.com/movie/bi/mi/basic.nhn?code=62266
https://movie.naver.com/movie/bi/mi/basic.nhn?code=24452
https://movie.naver.com/movie/bi/mi/basic.nhn?code=31794
https://movie.naver.com/movie/bi/mi/basic.nhn?code=72363
https://movie.naver.com/movie/bi/mi/basic.nhn?code=100931
https://movie.naver.com/movie/bi/mi/basic.nhn?code=61521
https://movie.naver.com/movie/bi/mi/basic.nhn?code=30688
https://movie.naver.com/movie/bi/mi/basic.nhn?code=62586
https://movie.naver.com/movie/bi/mi/basic.nhn?code=37148
//index.js
const xlsx = require('xlsx');
const puppeteer = require('puppeteer');
const add_to_sheet = require('./add_to_sheet');
const workbook = xlsx.readFile('xlsx/data.xlsx');
const ws = workbook.Sheets.영화목록;
const records = xlsx.utils.sheet_to_json(ws);
const crawler = async () => {
try {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.0 Safari/537.36');
add_to_sheet(ws, 'C1', 's', '평점'); //ws에 평점 제목 넣기
for (const [i, r] of records.entries()) {
await page.goto(r.링크); //csv가 아니라 r에서 링크 배열접근
const text = await page.evaluate(() => {
const score = document.querySelector('.score.score_left .star_score');
if (score) {
return score.textContent;
}
});
if (text) {
console.log(r.제목, '평점', text.trim()); //r에서 제목 접근
const newCell = 'C' + (i + 2); //C2부터
add_to_sheet(ws, newCell, 'n', text.trim()); //하나씩 입력
}
await page.waitFor(1000);
}
await page.close();
await browser.close();
xlsx.writeFile(workbook, 'xlsx/result.xlsx'); //엑셀에 입력
} catch (e) {
console.error(e);
}
}
crawler();
const records = xlsx.utils.sheet_to_json(ws);
for (const [i, r] of records.entries()) {
//await page.goto(r[0]);
await page.goto(r.링크); //csv가 아니라 r에서 링크 배열접근해야한다
//const newCell = 'c' + (i + 2);
const newCell = 'C' + (i + 2);
엑셀의 C는 대문자로 해서 해결