[RNN] Implementing an RNN with PyTorch (Classifying Family Names by Country)

dobbie 2025. 2. 27. 15:47

Data Preprocessing

 

🥕 Downloading and extracting the data

!wget https://download.pytorch.org/tutorial/data.zip
!unzip data.zip

 

🥕 Converting accented characters and special symbols in family names to plain ASCII letters

from glob import glob

file_list = glob('data/names/*.txt')
n_countries = len(file_list)
print("number of countries: {}".format(n_countries))
import string
import unicodedata

alphs = string.ascii_letters + " .,;'"
n_alphs = len(alphs)
print("number of alphabets:{}".format(n_alphs))
def unicodeToAscii(word):
  ascii_word = []
  for char in unicodedata.normalize("NFD", word): # base character + combining character
    if char in alphs and unicodedata.category(char) != "Mn": # keep only plain alphabet characters, dropping diacritic marks
      ascii_word.append(char)

  return ''.join(ascii_word)

# Read a file and convert the characters of each name to ASCII letters
def readLines(file):
  names = []
  lines = open(file, encoding='utf-8').read().strip().split("\n")
  for line in lines:
    names.append(unicodeToAscii(line))
  return names
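
Before moving on, it helps to sanity-check the conversion on a name that contains diacritics. A minimal check, assuming the functions above are defined (the example name is only an illustration):

# Illustrative check: diacritics are stripped, leaving plain ASCII letters
print(unicodeToAscii("Ślusàrski"))   # expected: Slusarski
print(readLines(file_list[0])[:5])   # first five converted names from the first file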

 

🥕 One-hot encoding family names

import os

namesxcountry_dict = {}
countries = []

for file in file_list:
  country = os.path.splitext(os.path.basename(file))[0]
  countries.append(country)
  namesxcountry_dict[country] = readLines(file)
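
Before encoding, it is worth peeking at what was just loaded. A small illustrative check (the exact values depend on the downloaded data):

print(countries[:5])                          # first few country names, taken from the file names
print(len(namesxcountry_dict[countries[0]]))  # number of family names in the first country's file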

import torch

def alph2idx(char):
    return alphs.find(char)

def alph2tensor(char):
    tensor = torch.zeros(1, n_alphs)
    tensor[0][alph2idx(char)] = 1

    return tensor

def word2tensor(word):
    tensor = torch.zeros(len(word), 1, n_alphs)
    for idx, char in enumerate(word):
        tensor[idx][0][alph2idx(char)] = 1
    return tensor
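
A quick shape check makes the encoding concrete: each character becomes a 1 x n_alphs one-hot row, and a whole name becomes a (name_length, 1, n_alphs) tensor, where the middle dimension is a batch size of 1. A minimal sketch using the functions above:

# Illustrative shape check for the one-hot encodings
print(alph2tensor('J').shape)       # torch.Size([1, 57]) -> 52 letters + " .,;'"
print(word2tensor('Jones').shape)   # torch.Size([5, 1, 57])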

 

Implementing and Training the RNN Model

🥕 Implementing the RNN

import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.input2hidden = nn.Linear(input_dim + hidden_dim, hidden_dim)
        self.hidden2output = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.input2hidden(combined)
        hidden = torch.tanh(hidden)
        output = self.hidden2output(hidden)

        return output, hidden

    def initial_hidden(self):
        return torch.zeros(1, self.hidden_dim)

 

🥕 Creating the RNN model

rnn = RNN(len(alphs), 128, n_countries)  # name the instance rnn so the torch.nn module is not shadowed; later code uses rnn and nn.CrossEntropyLoss()
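
The model processes a name one character at a time, carrying the hidden state forward between steps. A minimal sketch of a single forward step, assuming the model created above:

# Illustrative single step: feed the one-hot vector of the first character
hidden = rnn.initial_hidden()             # (1, 128) zero hidden state
input_tensor = word2tensor('Albert')[0]   # (1, 57) one-hot for 'A'
output, hidden = rnn(input_tensor, hidden)
print(output.shape, hidden.shape)         # e.g. torch.Size([1, 18]) torch.Size([1, 128]) with 18 country files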

 

🥕 A function that randomly samples a family name and its corresponding country

import random

def extractRandomData():
    country = random.choice(countries)
    word = random.choice(namesxcountry_dict[country])
    country_tensor = torch.tensor([countries.index(country)], dtype=torch.long)
    word_tensor = word2tensor(word)

    return country, word, country_tensor, word_tensor

for _ in range(5):
    country, word, country_tensor, word_tensor = extractRandomData()
    print('country =', country, '/ word =', word, '/ country_tensor =', country_tensor)

Output of the five random samples

 

🥕 Training function

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters())

def train(country_tensor, word_tensor):
    optimizer.zero_grad()
    hidden = rnn.initial_hidden()

    for i in range(word_tensor.size()[0]):
        output, hidden = rnn(word_tensor[i], hidden)

    loss = loss_fn(output, country_tensor)

    loss.backward()

    optimizer.step()

    return output, loss.item()
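
Only the output from the final character is fed to the loss, since the whole name determines the class; CrossEntropyLoss takes the raw logits of shape (1, n_countries) and a class-index target of shape (1,). A quick illustrative single-step check (exact values depend on the data and initialization):

# Illustrative: run one training step on a random sample
country, word, country_tensor, word_tensor = extractRandomData()
output, loss = train(country_tensor, word_tensor)
print(word, country, loss)
print(output.shape, country_tensor.shape)  # e.g. torch.Size([1, 18]) torch.Size([1])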

 

🥕 Training on 100,000 randomly sampled training examples

# 100,000 training samples
import time
import math

iteration = 100000
print_iter = 5000
plot_iter = 1000

cur_loss = 0
losses = []

def output2country(output):
    top_val, top_idx = output.topk(1)
    country_idx = top_idx[0].item()
    return countries[country_idx], country_idx

for i in range(1, iteration+1):
    country, word, country_tensor, word_tensor = extractRandomData()
    output, loss = train(country_tensor, word_tensor)
    cur_loss += loss

    if i % print_iter == 0:
        pred, pred_idx = output2country(output)
        if pred == country:
            correct = 'correct!'
        else:
            correct = 'wrong (%s)' % country
        print('iteration: %d loss: %.5f Family name: %s / Predicted name: %s %s' % (i, loss, word, pred, correct))

    if i % plot_iter == 0:
        losses.append(cur_loss / plot_iter)
        cur_loss = 0
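
The time and math imports above are presumably for timing the run, as in the official PyTorch tutorial; a minimal elapsed-time helper along those lines (an assumption, not shown in the post) could be added and used in the iteration log print:

# Optional helper (illustrative): format the elapsed time since training started
def timeSince(start):
    s = time.time() - start
    m = math.floor(s / 60)
    return '%dm %ds' % (m, s - m * 60)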

 

Output printed every 5,000 iterations

 

 

🥕 Analyzing family-name similarity between countries with a confusion matrix

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

confusion = torch.zeros(n_countries, n_countries)
n_confusion = 10000

def evaluate(word_tensor):
  hidden = rnn.initial_hidden()

  for i in range(word_tensor.size()[0]):
    output, hidden = rnn(word_tensor[i], hidden)

  return output

for i in range(n_confusion):
  country, word, country_tensor, word_tensor = extractRandomData()
  output = evaluate(word_tensor)
  pred, pred_idx = output2country(output)
  country_idx = countries.index(country)
  confusion[country_idx][pred_idx] += 1


for i in range(n_countries):
  confusion[i] = confusion[i] / confusion[i].sum()

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(confusion.numpy(), cmap="coolwarm")
fig.colorbar(cax)

ax.set_xticklabels([''] + countries, rotation=90)
ax.set_yticklabels([''] + countries)

ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

plt.show()

 

confusion matrix
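
Since each row of the confusion matrix was normalized to sum to 1, its diagonal holds the per-country accuracy; averaging it gives a rough summary of how well the model separates the countries. An illustrative follow-up using the tensor computed above:

# Illustrative: per-country accuracy from the row-normalized confusion matrix
per_country_acc = confusion.diag()
print(per_country_acc.mean().item())  # mean per-country accuracy
for c, acc in sorted(zip(countries, per_country_acc.tolist()), key=lambda x: -x[1])[:5]:
    print(f"{c}: {acc:.2f}")          # five best-recognized countries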

 

🥕 Prediction function

def predict(input_word, n_preds=3):
  print(f'input word:{input_word}')

  with torch.no_grad():
    output = torch.softmax(evaluate(word2tensor(input_word)), 1)

    top_v, top_idx = output.topk(n_preds, 1, True)
    predictions = []

    for i in range(n_preds):
      value = top_v[0][i].item()
      country_idx = top_idx[0][i].item()
      print(f"top-{i} score:{value:.4f}/ country: {countries[country_idx]}")
      predictions.append([value, countries[country_idx]])
  print("\n")

predict("Kim")
predict("Stone")
predict("Gao")

Top-3 predicted countries for each family name

 

🥕 Plotting how the loss decreases over training (averaged every 1,000 iterations)

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def smooth_curve(points, window_size=10):
    return np.convolve(points, np.ones(window_size)/window_size, mode='valid')

smoothed_losses = smooth_curve(losses, window_size=10)

plt.figure(figsize=(10, 5))
plt.plot(losses, label="Raw Loss", alpha=0.3)
plt.plot(range(len(smoothed_losses)), smoothed_losses, label="Smoothed Loss", linewidth=2, color='red')  # smoothed curve

plt.title("Loss Trend")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()

plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
plt.grid(True, linestyle="--", alpha=0.6)

plt.show()

 

 

 

 

 

GitHub Link: https://github.com/DanbiAubrey/RNN

 


 

Reference: 아니그니까's post "[Machine Learning] Implementing an RNN (Recurrent Neural Network) with PyTorch": https://dykm.tistory.com/39

 


 
