1편에 이어서, 앱스토어 리뷰를 수집하는 다른 방법도 찾아보고 적용해 보았다.
두 번째 방법은 RSS Feed를 이용하는 것으로, 이를 통해 앱스토어 리뷰를 크롤링할 수 있었다.
import pandas as pd
import xmltodict
import requests
import os
def get_url_index(url, timeout=10):
    """Return the last page number advertised by a customer-review feed.

    The feed at ``url`` is downloaded and parsed as XML. The ``href`` of the
    feed-level ``<link rel="last">`` element is split on ``/`` and the number
    following ``page=`` in that path is returned.

    Args:
        url: Feed URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The page number found in the "last" link, as an ``int``.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        KeyError: If the document has no ``feed``/``link`` elements or a
            ``rel="last"`` link without ``href``.
        IndexError: If there is no ``rel="last"`` link, or its ``href``
            contains no ``page=`` segment.
    """
    response = requests.get(url, timeout=timeout)
    # Fail fast on 4xx/5xx instead of trying to parse an error page as a feed.
    response.raise_for_status()
    xml = xmltodict.parse(response.content.decode('utf8'))

    links = xml['feed']['link']
    # xmltodict yields a dict (not a list) when an element occurs only once.
    if isinstance(links, dict):
        links = [links]

    last_url = [link['@href'] for link in links if link.get('@rel') == 'last'][0]
    last_index = [
        int(segment.replace('page=', ''))
        for segment in last_url.split('/')
        if 'page=' in segment
    ][0]
    return last_index
def _entry_to_row(entry):
    """Flatten one parsed feed ``entry`` mapping into a review record dict."""
    return {
        'USER': entry['author']['name'],
        'DATE': entry['updated'],
        'STAR': int(entry['im:rating']),
        'LIKE': int(entry['im:voteSum']),
        'TITLE': entry['title'],
        'REVIEW': entry['content'][0]['#text'],
    }


def appstore_crawler(appid, outfile='./appstore_reviews.csv'):
    """Download the customer reviews of an app from the RSS feed into a CSV.

    The number of pages is read from the feed itself via ``get_url_index``.
    Every page from 1 up to that number is then fetched, each ``entry`` is
    converted to a row (USER, DATE, STAR, LIKE, TITLE, REVIEW), and the rows
    are written to ``outfile`` encoded as ``utf-8-sig`` without an index
    column.

    Pages that cannot be downloaded or parsed, or that contain no entries,
    are reported on stdout and skipped. If the page count cannot be
    determined, or no review is collected at all, a message is printed and
    nothing is written.

    Args:
        appid: Numeric App Store id of the application.
        outfile: Path of the CSV file to write.

    Returns:
        None.
    """
    url = 'https://itunes.apple.com/kr/rss/customerreviews/page=1/id=%i/sortby=mostrecent/xml' % appid
    try:
        last_index = get_url_index(url)
    except Exception as e:
        print (url)
        print ('\tNo Reviews: appid %i' %appid)
        print ('\tException:', e)
        return

    result = []
    # Walk only the pages the feed says exist rather than a fixed 1..100.
    for idx in range(1, last_index + 1):
        url = "https://itunes.apple.com/kr/rss/customerreviews/page=%i/id=%i/sortby=mostrecent/xml?urlDesc=/customerreviews/id=%i/sortBy=mostRecent/xml" % (idx, appid, appid)
        print(url)

        try:
            response = requests.get(url, timeout=10).content.decode('utf8')
        except (requests.RequestException, UnicodeDecodeError) as e:
            # One bad page should not abort the whole crawl.
            print ('\tRequest Error %s\n\tSkip %s :' %(e, url))
            continue

        try:
            xml = xmltodict.parse(response)
        except Exception as e:
            print ('\tXml Parse Error %s\n\tSkip %s :' %(e, url))
            continue

        try:
            entries = xml['feed']['entry']
        except (KeyError, TypeError) as e:
            print ('\tNo Entry', e)
            continue
        if not entries:
            print ('\tNo Entry')
            continue

        # xmltodict returns a single dict when the page holds exactly one
        # <entry>, and a list when it holds several; normalise to a list.
        if isinstance(entries, dict):
            entries = [entries]

        result.extend(_entry_to_row(entry) for entry in entries)

    if not result:
        # An empty DataFrame has no 'DATE' column, so bail out before using it.
        print ('\tNo Reviews: appid %i' %appid)
        return

    res_df = pd.DataFrame(result)
    # NOTE(review): the format string carries no UTC-offset directive; if the
    # feed's <updated> values include an offset, confirm that the installed
    # pandas version accepts them with this format.
    res_df['DATE'] = pd.to_datetime(res_df['DATE'], format="%Y-%m-%dT%H:%M:%S")
    res_df.to_csv(outfile, encoding='utf-8-sig', index=False)
    print ('Save reviews to file: %s \n' %(outfile))
if __name__ == '__main__':
    # Known App Store ids:
    #   Baemin : 378084485
    #   Yogiyo : 543831532
    # The id below previously read 78084485, which matches neither of the
    # ids listed above (the leading "3" of the Baemin id was missing).
    app_id = 378084485
    # Output file is named after the app id, in the current directory.
    outfile = 'appstore_%i.csv' % app_id
    appstore_crawler(app_id, outfile=outfile)