자연어 숫자로 표현하기 - 정수 인코딩과 패딩(제로 패딩)

MLOps 부트캠프 by 한경+토스뱅크/Deep Learning

자연어 숫자로 표현하기 - 정수 인코딩과 패딩(제로 패딩)

나니니 2024. 10. 3. 21:41

정수 인코딩이란?

전처리된 텍스트 데이터를 컴퓨터가 분석에 활용할 수 있게 하려면 숫자 데이터로 변환해야 한다. 이를 위한 여러 방법이 있는데, 대표적으로 정수 인코딩이 있다. 정수 인코딩은 토큰화된 각 단어에 특정 정수를 맵핑하여 고유 번호로 사용하는 방법이다.

단어 토큰에 정수 인덱스를 부여하는 방법은 다양한데, 그 중 가장 일반적인 방법은 단어의 등장 빈도를 기준으로 정렬한 다음 인덱스를 부여하는 방식이다.

정수 인코딩 하기

정수 인코딩을 하면 더 이상 추가적인 전처리를 할 수 없다. 때문에 모든 전처리 과정이 끝난 코퍼스를 가지고 정수 인코딩을 해야 한다.

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from collections import Counter
import pandas as pd
# from preprocess import clean_by_freq
# from preprocess import clean_by_len
# from preprocess import clean_by_stopwords
# from preprocess import stemming_by_porter
# from preprocess import pos_tagger
# from preprocess import words_lemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# 등장 빈도 기준 정제 함수
def clean_by_freq(tokenized_words, cut_off_count):
    # 파이썬의 Counter 모듈을 통해 단어의 빈도수 카운트하여 단어 집합 생성
    vocab = Counter(tokenized_words)
    
    # 빈도수가 cut_off_count 이하인 단어 set 추출
    uncommon_words = {key for key, value in vocab.items() if value <= cut_off_count}
    
    # uncommon_words에 포함되지 않는 단어 리스트 생성
    cleaned_words = [word for word in tokenized_words if word not in uncommon_words]

    return cleaned_words


# 단어 길이 기준 정제 함수
def clean_by_len(tokenized_words, cut_off_length):
    cleaned_by_freq_len = []
    
    for word in tokenized_words:
        if len(word) > cut_off_length:
            cleaned_by_freq_len.append(word)

    return cleaned_by_freq_len
    
    
# 불용어 제거 함수
def clean_by_stopwords(tokenized_words, stop_words_set):
    cleaned_words = []
    
    for word in tokenized_words:
        if word not in stop_words_set:
            cleaned_words.append(word)
            
    return cleaned_words


# 포터 스테머 어간 추출 함수
def stemming_by_porter(tokenized_words):
    porter_stemmer = PorterStemmer()
    porter_stemmed_words = []

    for word in tokenized_words:
        stem = porter_stemmer.stem(word)
        porter_stemmed_words.append(stem)

    return porter_stemmed_words


# 품사 태깅 함수
def pos_tagger(tokenized_sents):
    pos_tagged_words = []

    for sentence in tokenized_sents:
        # 단어 토큰화
        tokenized_words = word_tokenize(sentence)
    
        # 품사 태깅
        pos_tagged = pos_tag(tokenized_words)
        pos_tagged_words.extend(pos_tagged)
    
    return pos_tagged_words


# Penn Treebank POS Tag를 WordNet POS Tag로 변경
def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB


# 표제어 추출 함수
def words_lemmatizer(pos_tagged_words):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []

    for word, tag in pos_tagged_words:
        wn_tag = penn_to_wn(tag)

        if wn_tag in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
            lemmatized_words.append(lemmatizer.lemmatize(word, wn_tag))
        else:
            lemmatized_words.append(word)

    return lemmatized_words

def idx_encoder(tokens, word_to_idx):
    encoded_idx = []
    
    for token in tokens:
        idx = word_to_idx[token]
        encoded_idx.append(idx)
        
    return encoded_idx

# 데이터 불러오기
df = pd.read_csv('/data/imdb.tsv', delimiter = "\\t")

# 대소문자 통합
df['review'] = df['review'].str.lower()

# 문장 토큰화
df['sent_tokens'] = df['review'].apply(sent_tokenize)

# 품사 태깅
df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)

# 표제어 추출
df['lemmatized_tokens'] = df['pos_tagged_tokens'].apply(words_lemmatizer)

# 추가 전처리
stopwords_set = set(stopwords.words('english'))

df['cleaned_tokens'] = df['lemmatized_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))

# 하나의 행 정수 인코딩
word_to_idx = {}
i = 0
encoded_idx = []

# 5번째 로우의 데이터를 가지고 정수 인코딩 적용
tokens = df['cleaned_tokens'][4]

vocab = Counter(tokens)
vocab = vocab.most_common() # 단어와 코퍼스 안에서 해당 단어의 등장 빈도가 튜플 혀여태로 매칭되어 빈도수 기준으로 리스트에 저장됨

# 각 단어에 인덱스 부여
for (word, frequency) in vocab:
    i = i + 1
    word_to_idx[word] = i

# 토큰들을 부여된 인덱스로 변경
for token in tokens:
    idx = word_to_idx[token]
    encoded_idx.append(idx)

# 전체 데이터 프레임 정수 인코딩
	# 데이터프레임에 있는 모든 로우들로 정수 인코딩을 하려면 전체 코퍼스의 토큰들을 전부 합하여 단어의 등장 빈도를 계산해야 한다. 
    	# 그래야 정수 인덱스가 각 코퍼스에 포함된 토큰들마다 동일하게 부여될 수 있다. 

word_to_idx = {}
i = 0
tokens = sum(df['cleaned_tokens'], []) # sum()함수에 모든 코퍼스들과 []를 파라미터로 넘ㄱ주면 하나의 합쳐진 토큰 리스트가 반환됨

vocab = Counter(tokens)
vocab = vocab.most_common()

for (word, frequency) in vocab:
    i = i + 1
    word_to_idx[word] = i

df['integer_encoded'] = df['cleaned_tokens'].apply(lambda x: idx_encoder(x, word_to_idx))

df[['integer_encoded']]

예시 - 정수 인코딩

코퍼스에 포함된 단어 토큰의 등장 빈도 기준으로 정수 인덱스를 생성하고, 모든 토큰들을 정수 인덱스로 변환해 주세요.

주어진 코퍼스의 단어별 등장 빈도를 계산하고, 많이 등장한 단어순으로 정렬해 주세요.
빈도수 기준으로 정렬한 단어들에 인덱스를 부여해 주세요.
부여된 정수 인덱스를 기준으로 모든 토큰들을 정수 인코딩해 주세요.

# 필요한 패키지와 함수 불러오기
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
# from text import TEXT
nltk.download('punkt')

TEXT = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its waistcoat-pocket, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge.
In another moment down went Alice after it, never once considering how in the world she was to get out again.
The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.
Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.
"""

word_to_idx = {} # 단어 별 인덱스 부여하기 위한 딕셔너리
i = 0
encoded_idx = [] # 각 토큰의 정수 인덱스를 부여하기 위한 리스트
corpus = TEXT

tokenized_words = word_tokenize(corpus)

# 단어의 빈도수를 계산하여 정렬하는 코드를 작성하세요
vocab = Counter(tokenized_words)
vocab = vocab.most_common()

for (word, frequency) in vocab:
    i = i + 1
    word_to_idx[word] = i

for word in tokenized_words:
    idx = word_to_idx[word]
    encoded_idx.append(idx)

# 테스트 코드
encoded_idx

패딩(Padding)

패딩(Padding)이란?

예를 들어서, 아래와 같은 문장들이 있다.

I love you
You are so lovely
I am loving you now

각 문장을 전처리 한 후에, 단어들을 등장 빈도로 정수 인코딩을 하게 되면 어떻게 될까?
각 문장들은 길이와 토큰의 수가 다 다르기 때문에 인코딩된 정수의 개수도 서로 달라진다. 'I love you'는 3개, 'You are so lovely'는 4개, 'I am loving you now'에는 5개의 정수 인덱스가 포함된다.

그런데 만약 각 문장의 토큰 수가 모두 같다면 정수 인코딩 결과는 하나의 행렬로 만들 수 있을 것이다. 위의 예시와 다르게 3개의 문장 전부 단어 토큰 수가 5개였다면 3*5 행렬이 만들어진다.

그리고 데이터를 행렬로 만들면, 컴퓨터는 많은 데이터를 한 번에 묶어서 처리할 수 있다. 그러면 좀 더 복잡한 연산을 쉽게 처리하는게 가능해진다. 때문에 각기 다른 문장의 길이를 서로 맞춰서 행렬 형태로 만드는게 필요한데 그 과정을 패딩이라고 한다.

제로 패딩(Zero Padding)

패딩을 하는 방법은 여러가지인데, 대표적으로 제로 패딩이 있다.

제로 패딩은 정수 인코딩을 한 문장 중 가장 긴 문장의 길이를 구하고, 길이가 짧은 문장에 숫자 0을 채워서 가장 긴 문장과 길이를 맞추는 방법이다.
앞에서 정수 인코딩 작업을 할 때 빈도수에 따라 정수 인덱스를 1부터 부여했다. 때문에 정수 0은 아무 의미를 갖고 있지 않고, 컴퓨터는 해당 인덱스를 무시하게 된다.

앞에서 정수 인코딩을 했던 df['integer_encoded']를 활용해 제로 패딩을 한번 해보자.

예시 - 제로 패딩

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from collections import Counter
import pandas as pd
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from collections import Counter
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')


# 등장 빈도 기준 정제 함수
def clean_by_freq(tokenized_words, cut_off_count):
    # 파이썬의 Counter 모듈을 통해 단어의 빈도수 카운트하여 단어 집합 생성
    vocab = Counter(tokenized_words)
    
    # 빈도수가 cut_off_count 이하인 단어 set 추출
    uncommon_words = {key for key, value in vocab.items() if value <= cut_off_count}
    
    # uncommon_words에 포함되지 않는 단어 리스트 생성
    cleaned_words = [word for word in tokenized_words if word not in uncommon_words]

    return cleaned_words


# 단어 길이 기준 정제 함수
def clean_by_len(tokenized_words, cut_off_length):
    cleaned_by_freq_len = []
    
    for word in tokenized_words:
        if len(word) > cut_off_length:
            cleaned_by_freq_len.append(word)

    return cleaned_by_freq_len
    
    
# 불용어 제거 함수
def clean_by_stopwords(tokenized_words, stop_words_set):
    cleaned_words = []
    
    for word in tokenized_words:
        if word not in stop_words_set:
            cleaned_words.append(word)
            
    return cleaned_words


# 포터 스테머 어간 추출 함수
def stemming_by_porter(tokenized_words):
    porter_stemmer = PorterStemmer()
    porter_stemmed_words = []

    for word in tokenized_words:
        stem = porter_stemmer.stem(word)
        porter_stemmed_words.append(stem)

    return porter_stemmed_words


# 품사 태깅 함수
def pos_tagger(tokenized_sents):
    pos_tagged_words = []

    for sentence in tokenized_sents:
        # 단어 토큰화
        tokenized_words = word_tokenize(sentence)
    
        # 품사 태깅
        pos_tagged = pos_tag(tokenized_words)
        pos_tagged_words.extend(pos_tagged)
    
    return pos_tagged_words


# Penn Treebank POS Tag를 WordNet POS Tag로 변경
def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB


# 표제어 추출 함수
def words_lemmatizer(pos_tagged_words):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []

    for word, tag in pos_tagged_words:
        wn_tag = penn_to_wn(tag)

        if wn_tag in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
            lemmatized_words.append(lemmatizer.lemmatize(word, wn_tag))
        else:
            lemmatized_words.append(word)

    return lemmatized_words


def idx_encoder(tokens, word_to_idx):
    encoded_idx = []
    
    for token in tokens:
        idx = word_to_idx[token]
        encoded_idx.append(idx)
        
    return encoded_idx

# 데이터 불러오기
df = pd.read_csv('/data/imdb.tsv', delimiter = "\\t")

# 대소문자 통합
df['review'] = df['review'].str.lower()

# 문장 토큰화
df['sent_tokens'] = df['review'].apply(sent_tokenize)

# 품사 태깅
df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)

# 표제어 추출
df['lemmatized_tokens'] = df['pos_tagged_tokens'].apply(words_lemmatizer)

# 추가 전처리
stopwords_set = set(stopwords.words('english'))

df['cleaned_tokens'] = df['lemmatized_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))

# 하나의 행 정수 인코딩
word_to_idx = {}
i = 0
encoded_idx = []

tokens = df['cleaned_tokens'][4]

vocab = Counter(tokens)
vocab = vocab.most_common()

for (word, frequency) in vocab:
    i = i + 1
    word_to_idx[word] = i

for token in tokens:
    idx = word_to_idx[token]
    encoded_idx.append(idx)

# 전체 데이터 프레임 정수 인코딩
word_to_idx = {}
i = 0
tokens = sum(df['cleaned_tokens'], [])

vocab = Counter(tokens)
vocab = vocab.most_common()

for (word, frequency) in vocab:
    i = i + 1
    word_to_idx[word] = i

df['integer_encoded'] = df['cleaned_tokens'].apply(lambda x: idx_encoder(x, word_to_idx))

# 패딩
max_len = max(len(item) for item in df['integer_encoded'])

for tokens in df['integer_encoded']:
    while len(tokens) < max_len:
        tokens.append(0)

df[['integer_encoded']]

현재글자연어 숫자로 표현하기 - 정수 인코딩과 패딩(제로 패딩)

데이터 개발 공부