[Preprocess] 문장 전처리

Hyun·2023년 2월 22일

Python

목록 보기

3/3

문장 전처리

HTML Tag 확인
HTML Tag 삭제
이모지 삭제
반복 문자 삭제

HTML Tag 확인

import re
# Survey the HTML tags and character entities present in the '내용' column.
# The result is only inspected by eye (the author decided to fix them by hand).
# Pattern: "<...>" with no Hangul syllables inside, or an entity such as
# "&nbsp;", "&#39;" or "&#x27;".
_tag_or_entity = re.compile(r'<[^가-힣]*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
set_list = set()
for v in df['내용'].values:
  # finditer scans the whole string. re.match only tests position 0, so a tag
  # or entity appearing after the first character would never be collected.
  set_list.update(found.group() for found in _tag_or_entity.finditer(v))

{'뺴', '꺆', '썜', '🤔', '앜', '뭥', '🌹', '🌆', '👍', '𐤟', '🥳', '읭', '핰', '✿', '', '💕', '🤳', '👨', '쬬', '🎉', '👏', '✍', '읎', '흫', '헿', '🧡', '😊', '💙', '⚠', '쫜', 'ヾ', '꺜', '😉', '😄', '굥', '😃',

'&gt;', '&nbsp;',

'<h2 style="font-style:italic">', '&quot;', '&lt;', '&#39;', '<img alt="wink" src="https://tzone...png" style="height:23px; width:23px" title="wink" />',

'<br/>', '<h2>', '<3>', '<div>', '<s>', '<p>', '<br />', '<strong>', '<table>',

'<span style="font-size:16px">', '<span style="font-size:14px">', '<span style="display:none">', '<span style="font-size:10pt">', '<span style="font-size:12px">', '<span style="font-size:11px">'} '<span style="font-size:10px">', '<div style="margin-left:40px">', '<span style="color:rgb(102, 102, 102); font-family:dotum,verdana; font-size:12px; line-height:18px">', '<span style="color:rgb(0, 0, 0); font-family:malgun gothic,malgungothic,nanumgothic,helvetica neue,helvetica,arial,sans-serif; font-size:10.6667px">'

'<p class="0" style="mso-padding-alt:0.0pt 0.0pt 0.0pt 0.0pt; mso-pagination:none; text-autospace:none">'
'',

HTML Tag 삭제

import requests
from bs4 import BeautifulSoup

# Parse every entry of the '내용' column with the html5lib parser and keep only
# the text that get_text() returns.
df['내용'] = df['내용'].apply(
  lambda raw_html: BeautifulSoup(raw_html, "html5lib").get_text()
)

이모지 삭제

이모지 범위가 더 늘어났을 수도 있음. 유니코드 형태인 다른 문자도 발견됨. (아래 패턴은 BMP 밖의 문자만 지우므로 ✿, ⚠ 같은 BMP 안의 기호는 그대로 남는다.)

# Matches one or more consecutive code points outside the Basic Multilingual
# Plane (U+10000 through U+10FFFF). Despite its name, the pattern matches the
# NON-BMP characters, so substituting them away leaves only BMP text.
only_BMP_pattern = re.compile("[\U00010000-\U0010FFFF]+", flags=re.UNICODE)
# Replace every match of only_BMP_pattern in `text` with the empty string.
# NOTE(review): only characters above U+FFFF are matched, so symbols encoded
# inside the BMP (e.g. U+26A0) are left in place.
text = only_BMP_pattern.sub(r'', text) # no emoji

반복 문자 삭제

def continue_string(self, s):
  """Find characters that occur three or more times in a row in ``s``.

  Args:
    self: Unused; the function never reads it.
    s: The string to scan.

  Returns:
    A dict mapping each such character to the length of its run. When the
    same character has several runs of three or more, the length of the
    last one is kept.
  """
  run_lengths = {}
  previous = ''
  streak = 1
  for current in s:
    if current == previous:
      streak += 1
    else:
      previous, streak = current, 1
    # Written on every step past the second repeat, so the stored value ends
    # up being the full length of the run.
    if streak >= 3:
      run_lengths[previous] = streak
  return run_lengths


# One character (newlines included) followed by two or more copies of itself.
_REPEATED_RUN = re.compile(r'(.)\1{2,}', flags=re.DOTALL)


def replace_continue_string(self, s):
  """Collapse every run of three or more identical characters to two.

  The earlier version replaced only runs of the single length that
  continue_string recorded per character, so a string with runs of
  different lengths (e.g. "aaaaa b aaa") was only partly shortened. A
  back-referencing regex handles every run regardless of its length.

  Args:
    self: Unused. Kept so the existing two-argument signature still works.
    s: The string to clean.

  Returns:
    ``s`` with each run of 3+ repeated characters shortened to exactly two.
  """
  return _REPEATED_RUN.sub(r'\1\1', s)


# Remove repeated characters from the text column. Series.apply hands the
# function one string at a time, whereas DataFrame.apply would pass a whole
# column. `None` fills the unused `self` parameter.
df['내용'] = df['내용'].apply(lambda x: replace_continue_string(None, x))

Hyun

이전 포스트