[NLP] Word Embedding : Bag of words / TFIDF / Word2Vec / fastText

자연어처리, NLP, Embedding, Bag of words, TFIDF, Word2Vec, fastText

Featured image

[NLP] Word Embedding : Bag of words / TFIDF / Word2Vec / fastText

1. Bag of words




#### Bag of Words ####

# Toy corpus of three short Korean documents; reused by the TF-IDF example.
corpus = [
    '학교에 가서 수업을 들었다. 학교에 간건 오랜만이다.',
    '학교에 가서 친구 얘기를 들었다.',
    '내일 가서 뭐 먹지?'
]

from sklearn.feature_extraction.text import CountVectorizer

# Fit a count vectorizer on the corpus to learn its vocabulary.
vect = CountVectorizer()
vect.fit(corpus)
# Learned vocabulary (token -> column index); a bare expression, shown as
# notebook cell output.
vect.vocabulary_

# Bag-of-words count matrix for the corpus, converted from sparse to a dense
# array for display.
vect.transform(corpus).toarray()

# Vectorize a new sentence with the already-fitted vocabulary (no refit).
vect.transform(['수업을 들었다. 수업은 재미있다.']).toarray()



2. TFIDF (Term Frequency-Inverse Document Frequency)


#### TFIDF ####

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the same `corpus` defined in the Bag of Words
# section.
tfidv = TfidfVectorizer().fit(corpus)
# TF-IDF weighted document-term matrix, converted to a dense array for display.
tfidv.transform(corpus).toarray()



3. Word2Vec



#### Word2Vec ####

# download the NSMC training ratings and build a text-only dataset (.txt file)
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

import pandas as pd

# Load the tab-separated ratings file and keep only the 'document' column.
df = pd.read_csv('ratings_train.txt', sep='\t')
doc = list(df['document'])

# Write one document per line. The encoding is explicit so the Korean text
# round-trips the same way regardless of the platform's default encoding.
with open('ratings_train_text_only.txt', 'w', encoding='utf-8') as f:
    f.writelines(str(text) + '\n' for text in doc)

# read text file
# The original opened the file but built `texts` from the in-memory `doc`
# list, so the file was never actually read. Iterate over the file itself,
# strip the trailing newline, and keep lines of at least 10 characters.
with open('ratings_train_text_only.txt', 'r', encoding='utf-8') as f:
    texts = [line for line in (raw.rstrip('\n') for raw in f) if len(line) >= 10]
  
# word2vec training

import os
from gensim.models import Word2Vec

def word2vec(texts):
    """Train a Word2Vec model on space-tokenized texts and save it to disk.

    Args:
        texts: iterable of sentence strings; each one is tokenized by
            splitting on a single space character.

    Returns:
        None. The trained model is saved to the file 'word2vec' in the
        current working directory.
    """
    inputs = [tt.split(' ') for tt in texts]
    print('number of text = ', len(inputs))

    print('word2vec training...')
    # NOTE: `size` / `iter` are the gensim 3.x keyword names (they were
    # renamed `vector_size` / `epochs` in gensim 4).
    model = Word2Vec(inputs, size=50, window=3, min_count=3, negative=5, workers=os.cpu_count(), iter=10, sg=1)
    # Previously written as `model.init_sims` (attribute access without a
    # call), which was a no-op. Call it, as the fasttext() trainer does.
    model.init_sims()

    model.save('word2vec')

# Train on the prepared texts; word2vec() saves the model to the file
# 'word2vec'.
word2vec(texts)

# Reload the saved model from disk.
w2v = Word2Vec.load('word2vec')

# word vector
w2v.wv['감동']

# similar words
w2v.wv.most_similar('사랑')



4. fastText



#### fastText ####

# fasttext training

import os
from gensim.models import FastText

def fasttext(texts):
    """Fit a FastText model on the given sentences and persist it.

    Args:
        texts: iterable of sentence strings; every sentence is tokenized
            by splitting on a single space.

    Returns:
        None. The fitted model is written to the file 'fasttext' in the
        current working directory.
    """
    tokenized = []
    for sentence in texts:
        tokenized.append(sentence.split(' '))
    print('number of text = ', len(tokenized))

    # Training hyperparameters, gathered in one place for readability.
    hyperparams = dict(
        size=50,
        window=3,
        min_count=3,
        negative=5,
        workers=os.cpu_count(),
        iter=10,
        sg=1,
    )
    ft_model = FastText(tokenized, **hyperparams)
    ft_model.init_sims()

    ft_model.save('fasttext')
    print('fasttext is trained')

# Train on the prepared texts; fasttext() saves the model to the file
# 'fasttext'.
fasttext(texts)

# Reload both saved models from disk.
w2v = Word2Vec.load('word2vec')
# NOTE(review): this rebinds the name `fasttext` from the training function
# to the loaded model, so fasttext(texts) can no longer be called after this
# line.
fasttext = FastText.load('fasttext')

# NOTE(review): the query looks like a deliberate misspelling of '고등학교',
# presumably to show fastText handling a word not seen in training - confirm.
fasttext.wv.most_similar('고능학교')


참고자료

  • https://wikidocs.net/22650
  • https://wikidocs.net/31698