NLP에 대해 알고 싶어서 무작정 책을 따라해 봤다.
NLP : computer understands human languages - syntax (arrangement, structure) & semantic (meaning, words) analysis through deep learning
# Setup: import NLTK and fetch the data needed by the tokenizer examples.
import nltk
nltk.download('punkt')  # 'punkt' tokenizer resource
text processing - Tokenization
- Lemmatization
- Stemming
- Stopword removal
# Sentence tokenization: split a string into a list of sentences.
from nltk.tokenize import sent_tokenize
myString = " sentence. . . "
# A hyphen is not allowed in a Python identifier (it parses as subtraction),
# so the result is stored under an underscore-separated name.
tokenized_sentence = sent_tokenize(myString)
print(tokenized_sentence)
# Sentences are delimited by periods.
-word~
-sent
-punkt
-Regexp
-TreebankWord_
# Baseline for comparison: plain str.split() rather than an NLTK tokenizer.
from nltk.tokenize import sent_tokenize
myString = "Sent. Sent."
print(myString.split())  # split() with no arguments splits on whitespace
# Word-level tokenizers: (1) word_tokenize, (2) regexp_tokenize.
from nltk.tokenize import word_tokenize, regexp_tokenize
myString = " Sent. Sent. "
print( word_tokenize(myString) )
# 1. word_tokenize: punctuation marks are split off as their own tokens
# 2. regexp_tokenize: tokenization is customized through a regex pattern
ex)
# Custom tokenization: keep only runs of word characters, dropping punctuation.
from nltk.tokenize import word_tokenize, regexp_tokenize
myString = " Sent. Sent. "
# Raw string so the backslash reaches the regex engine unchanged. \w+ matches
# runs of word characters; the earlier "\|+" matched literal pipe characters
# and therefore produced no word tokens.
print( regexp_tokenize(myString, pattern=r"\w+") )  # \w+ : all words but symbols
# Stemming with the Porter stemmer: reduce a word to its stem.
from nltk.stem import PorterStemmer
porter = PorterStemmer()
print(porter.stem("cutting"))  # closing parenthesis for print() was missing
>> cut
# Stemming with the Lancaster stemmer: stem a single word.
from nltk.stem import LancasterStemmer
lancaster = LancasterStemmer()
print ( lancaster.stem("sleeping") )
>> sleep
# Stemming with the Snowball stemmer, which must be told the language.
# The class lives in the submodule nltk.stem.snowball (dot, not hyphen).
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer("english")  # language should be given
print( snowball.stem("driving") )
>> drive
# Lemmatization vs. stemming on the same irregular verb form.
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('wordnet')  # download the WordNet data through the toolkit
porter = PorterStemmer()
lemma = WordNetLemmatizer()
# lemmatize() treats its input as a noun unless told otherwise, which leaves
# "drove" unchanged; pos="v" makes it look the word up as a verb.
print(lemma.lemmatize("drove", pos="v"))
# The stemmer only strips suffixes, so it cannot map an irregular past tense
# back to its base form.
print(porter.stem("drove"))
>> drive
>> drove
to ignore articles and prepositions
ex)
# Stopword removal: drop common function words from a whitespace-split text.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
mylist = stopwords.words('english')  # language given
paragraph = "Family is not an important thing. It's everything."
# The membership test is an exact string comparison on whitespace-split
# tokens, so tokens keep their attached punctuation and original casing.
postPa = [word for word in paragraph.split() if word not in mylist]
print(postPa)
>>['Family', 'important', 'thing.', "It's", 'everything.']
lower()
upper()
정규표현식 활용
sub : 문자열에서 정규 표현식과 일치하는 부분에 대해서 다른 문자열로 대체
ex)
# Remove every run of digits from the sentence with a regular expression.
import re
myString = "i have 10 days."
digit_run = re.compile(r'\d+')
# Only the digits are replaced; the spaces on either side of them remain.
output = digit_run.sub('', myString)
print(output)
>> i have  days.
string 라이브러리에서 translate function
strip() 이용.
TextBlob 사용
pos tag 사용
nltk.pos_tag(tokens)