일 | 월 | 화 | 수 | 목 | 금 | 토 |
---|---|---|---|---|---|---|
 |  |  |  | 1 | 2 | 3 |
4 | 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 | 21 | 22 | 23 | 24 |
25 | 26 | 27 | 28 | 29 | 30 | 31 |
- rnn구현
- inductive
- 기초머신러닝
- Machine Learning
- inductive transductive
- 푸리에
- 서버 os
- rnn
- 서버로 파일 복사
- 크롤링 주의사항
- transductive
- fourier 변환
- virtual env
- 서버 os확인
- 크롤링할때 중요한것
- server os
- 머신러닝
- dlib 설치
- 푸리에 변환
- fourier transform
- inductive learning
- python2 python3
- python패키지설치
- transductive learning
- python버전 동시 사용
- ssh connection closed
- Fourier
- 로컬에서 서버
- 머신러닝 딥러닝
- 푸리에변환이란
- Today
- Total
우당탕탕 도비의 코딩로그
[RNN] Pytorch로 RNN구현하기 (나라별 Family name 구분하기) 본문
데이터 전처리
🥕 데이터 다운로드 및 압축해제
!wget https://download.pytorch.org/tutorial/data.zip
!unzip data.zip
🥕 Family name에 있는 특수 부호나 문자들을 알파벳으로 변환
from glob import glob
# glob() returns paths in arbitrary, filesystem-dependent order. Sorting keeps
# the file order (and therefore the index each country gets later) identical
# from one run or machine to the next.
file_list = sorted(glob('data/names/*.txt'))
n_countries = len(file_list)
print("number of countries: {}".format(n_countries))
import string
import unicodedata
# Characters the model can represent: ASCII letters plus a few punctuation
# marks that occur in names.
alphs = string.ascii_letters + " .,;'"
n_alphs = len(alphs)
print("number of alphabets:{}".format(n_alphs))


def unicodeToAscii(word):
    """Return `word` reduced to the characters listed in `alphs`.

    The word is NFD-normalized first, which splits an accented letter into
    its base character followed by combining marks. Only characters that are
    in `alphs` and are not combining marks (category "Mn") are kept.
    """
    decomposed = unicodedata.normalize("NFD", word)
    return ''.join(
        ch for ch in decomposed
        if unicodedata.category(ch) != "Mn" and ch in alphs
    )
# Read a file and convert the characters of each name to plain ASCII letters
def readLines(file):
    """Return the names stored in `file`, one per line, ASCII-folded.

    Args:
        file: Path of a UTF-8 text file holding one name per line.

    Returns:
        A list with `unicodeToAscii` applied to every line of the file
        (leading/trailing whitespace of the whole file is stripped first).
    """
    # Use a context manager so the file handle is closed as soon as reading
    # finishes instead of whenever the garbage collector gets to it.
    with open(file, encoding='utf-8') as handle:
        lines = handle.read().strip().split("\n")
    return [unicodeToAscii(line) for line in lines]
🥕 Family name 을 one-hot encoding 하기
import os
# Map each country (the file name without its extension) to the list of names
# read from that file; `countries` keeps the same order as `file_list`.
namesxcountry_dict = {}
countries = []
for path in file_list:
    filename = os.path.basename(path)
    country, _ext = os.path.splitext(filename)
    countries.append(country)
    namesxcountry_dict[country] = readLines(path)
import torch
def alph2idx(char):
    """Return the position of `char` within `alphs`.

    Args:
        char: A single character expected to be part of `alphs`.

    Returns:
        The zero-based index of `char` in `alphs`.

    Raises:
        ValueError: If `char` does not occur in `alphs`.
    """
    idx = alphs.find(char)
    if idx < 0:
        # str.find reports "not found" as -1, and -1 is a valid tensor index:
        # the one-hot encoders would silently mark the last column instead of
        # failing, so reject unknown characters explicitly.
        raise ValueError(
            "character {!r} is not in the supported alphabet".format(char)
        )
    return idx
def alph2tensor(char):
    """Build a one-hot row vector for a single character.

    Returns a tensor created as zeros of shape (1, n_alphs) with a 1 written
    at the column given by `alph2idx(char)`.
    """
    one_hot = torch.zeros(1, n_alphs)
    one_hot[0, alph2idx(char)] = 1
    return one_hot
def word2tensor(word):
    """One-hot encode every character of `word`.

    Returns a tensor created as zeros of shape (len(word), 1, n_alphs); for
    each position, the column given by `alph2idx` of that character is set
    to 1.
    """
    encoded = torch.zeros(len(word), 1, n_alphs)
    for position, letter in enumerate(word):
        encoded[position, 0, alph2idx(letter)] = 1
    return encoded
RNN 모델 구현과 학습
🥕 RNN 구현
import torch.nn as nn
class RNN(nn.Module):
    """Single-step recurrent cell followed by a linear read-out.

    Each call consumes one input vector together with the previous hidden
    state and produces an output vector and the next hidden state.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the two linear layers.

        Args:
            input_dim: Width of one input vector.
            hidden_dim: Width of the hidden state.
            output_dim: Width of the output vector.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        # The recurrent layer receives the input and the previous hidden
        # state concatenated, hence the summed input width.
        self.input2hidden = nn.Linear(input_dim + hidden_dim, hidden_dim)
        self.hidden2output = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden):
        """Advance one step and return `(output, new_hidden)`.

        `input` and `hidden` are concatenated along dimension 1, passed
        through `input2hidden` and tanh to form the new hidden state, which
        `hidden2output` then maps to the output. No softmax is applied here.
        """
        joined = torch.cat([input, hidden], dim=1)
        new_hidden = torch.tanh(self.input2hidden(joined))
        return self.hidden2output(new_hidden), new_hidden

    def initial_hidden(self):
        """Return an all-zero hidden state of shape (1, hidden_dim)."""
        return torch.zeros(1, self.hidden_dim)
🥕 RNN 모델 생성
# Bind the model to `rnn`, the name all later code uses (rnn.parameters(),
# rnn.initial_hidden(), rnn(...)). Binding it to `nn` would also shadow the
# `torch.nn` module alias and break the later nn.CrossEntropyLoss() call.
rnn = RNN(len(alphs), 128, n_countries)
🥕 Random하게 Family name 과 해당하는 Country 데이터 가져오는 함수
import random
def extractRandomData():
    """Draw one random (country, family name) sample.

    A country is chosen with `random.choice` from `countries`, then a name
    is chosen with `random.choice` from that country's list.

    Returns:
        A tuple `(country, word, country_tensor, word_tensor)` where
        `country_tensor` is a one-element long tensor holding the country's
        index in `countries` and `word_tensor` is `word2tensor(word)`.
    """
    country = random.choice(countries)
    candidates = namesxcountry_dict[country]
    word = random.choice(candidates)
    label = countries.index(country)
    country_tensor = torch.tensor([label], dtype=torch.long)
    return country, word, country_tensor, word2tensor(word)


# Print five random samples as a quick sanity check of the sampler.
for _ in range(5):
    country, word, country_tensor, word_tensor = extractRandomData()
    print('country =', country, '/ word =', word, '/ country_tensor =', country_tensor)
🥕 train 함수
# Classification loss on the model's raw outputs, and an Adam optimizer with
# default settings over all parameters of `rnn`.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters())


def train(country_tensor, word_tensor):
    """Run one optimisation step on a single family name.

    The name is fed to `rnn` one character tensor at a time, starting from
    `rnn.initial_hidden()`; the loss compares the output of the final step
    with `country_tensor`.

    Returns:
        A tuple `(output, loss_value)`: the output of the last step and the
        loss as a Python float.
    """
    optimizer.zero_grad()
    hidden = rnn.initial_hidden()
    # Iterating a tensor walks its first dimension: one character per step.
    for char_tensor in word_tensor:
        output, hidden = rnn(char_tensor, hidden)
    loss = loss_fn(output, country_tensor)
    loss.backward()
    optimizer.step()
    return output, loss.item()
🥕 Training data 100,000개를 추출하여 학습진행
# 100,000 개의 training data
import time
import math
# Training schedule: total number of samples, how often to print progress,
# and how often to record an averaged loss value.
iteration, print_iter, plot_iter = 100_000, 5_000, 1_000
# Loss accumulated since the last recorded point, and the recorded averages.
cur_loss = 0
losses = []
def output2country(output):
    """Translate a model output into its highest-scoring country.

    Returns:
        A tuple `(country_name, country_index)` for the largest entry of
        `output` (taken from its first row when `output` is 2-D).
    """
    _, best = output.topk(1)
    country_idx = best[0].item()
    return countries[country_idx], country_idx
# Train on `iteration` random samples, printing progress every `print_iter`
# steps and recording the mean loss of every `plot_iter` steps in `losses`.
for step in range(1, iteration + 1):
    country, word, country_tensor, word_tensor = extractRandomData()
    output, loss = train(country_tensor, word_tensor)
    cur_loss += loss

    if step % print_iter == 0:
        pred, pred_idx = output2country(output)
        correct = 'correct!' if pred == country else 'wrong (%s)' % country
        print('iteration: %d loss: %.5f Family name: %s / Predicted name: %s %s' % (step, loss, word, pred, correct))

    if step % plot_iter == 0:
        losses.append(cur_loss / plot_iter)
        cur_loss = 0
🥕 예측 함수
def predict(input_word, n_preds=3):
    """Print and return the most likely countries for a family name.

    Args:
        input_word: Family name to classify.
        n_preds: How many top candidates to report; capped at the number of
            classes the model outputs.

    Returns:
        A list of `[score, country]` pairs, best first, where `score` is the
        softmax probability of that country.
    """
    print(f'input word:{input_word}')
    with torch.no_grad():
        # Run the forward pass here instead of calling `evaluate`: that
        # helper is only defined further down, after the example calls
        # below have already executed.
        hidden = rnn.initial_hidden()
        for char_tensor in word2tensor(input_word):
            logits, hidden = rnn(char_tensor, hidden)
        output = torch.softmax(logits, 1)
        # topk fails when asked for more entries than there are classes.
        n_preds = min(n_preds, output.size(1))
        top_v, top_idx = output.topk(n_preds, 1, True)
    predictions = []
    for i in range(n_preds):
        value = top_v[0][i].item()
        country_idx = top_idx[0][i].item()
        print(f"top-{i} score:{value:.4f}/ country: {countries[country_idx]}")
        predictions.append([value, countries[country_idx]])
    print("\n")
    # The list was built but never handed back to the caller before.
    return predictions


predict("Kim")
predict("Stone")
predict("Gao")
🥕 각 나라 간의 Family name 유사도 Confusion matrix 로 분석
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# Square count matrix, later indexed as [true country][predicted country].
confusion = torch.zeros((n_countries, n_countries))
# Number of random samples used to fill the matrix.
n_confusion = 10_000
def evaluate(word_tensor):
    """Feed an encoded name through `rnn` and return the final step's output.

    Starts from `rnn.initial_hidden()` and applies `rnn` once per entry along
    the first dimension of `word_tensor`.
    """
    hidden = rnn.initial_hidden()
    for char_tensor in word_tensor:
        output, hidden = rnn(char_tensor, hidden)
    return output
# Count (true country, predicted country) pairs over random samples.
# Gradients are not needed for evaluation, so skip building autograd graphs.
with torch.no_grad():
    for _ in range(n_confusion):
        country, word, country_tensor, word_tensor = extractRandomData()
        output = evaluate(word_tensor)
        pred, pred_idx = output2country(output)
        country_idx = countries.index(country)
        confusion[country_idx][pred_idx] += 1

# Turn each row into fractions of that country's samples. A country that was
# never drawn has a zero row; leave it at zero instead of dividing by zero,
# which would fill the row with NaN.
for i in range(n_countries):
    row_total = confusion[i].sum()
    if row_total > 0:
        confusion[i] = confusion[i] / row_total
# Draw the row-normalised confusion matrix as a heat map.
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(confusion.numpy(), cmap="coolwarm")
fig.colorbar(cax)
# Fix one tick on every cell before labelling. Setting tick labels without
# fixed tick positions ties each label to whatever ticks the locator happens
# to produce, which is why a leading '' placeholder was needed; explicit
# positions make the label/cell pairing unambiguous.
tick_positions = list(range(len(countries)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(countries, rotation=90)
ax.set_yticklabels(countries)
plt.show()
🥕 예측 함수
def predict(input_word, n_preds=3):
    """Print and return the most likely countries for a family name.

    Args:
        input_word: Family name to classify.
        n_preds: How many top candidates to report; capped at the number of
            classes the model outputs.

    Returns:
        A list of `[score, country]` pairs, best first, where `score` is the
        softmax probability of that country.
    """
    print(f'input word:{input_word}')
    with torch.no_grad():
        output = torch.softmax(evaluate(word2tensor(input_word)), 1)
        # topk fails when asked for more entries than there are classes.
        n_preds = min(n_preds, output.size(1))
        top_v, top_idx = output.topk(n_preds, 1, True)
    predictions = []
    for i in range(n_preds):
        value = top_v[0][i].item()
        country_idx = top_idx[0][i].item()
        print(f"top-{i} score:{value:.4f}/ country: {countries[country_idx]}")
        predictions.append([value, countries[country_idx]])
    print("\n")
    # The list was built but never handed back to the caller before.
    return predictions


predict("Kim")
predict("Stone")
predict("Gao")
🥕 loss 가 1000 iteration마다 줄어드는 정도 그래프로 표현
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
def smooth_curve(points, window_size=10):
    """Return the moving average of `points`.

    Args:
        points: Sequence of numbers to smooth.
        window_size: Number of consecutive points averaged per output value.
            If it exceeds the number of points, it is reduced to that number.

    Returns:
        A 1-D float array of length `len(points) - window + 1`, where
        `window` is the effective window size; an empty array for empty
        input.

    Raises:
        ValueError: If `window_size` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    values = np.asarray(points, dtype=float)
    if values.size == 0:
        # np.convolve rejects empty input; nothing to smooth anyway.
        return values
    # With a kernel longer than the data, mode='valid' swaps the roles of the
    # two arrays and returns more values than there are points, so cap it.
    window = min(int(window_size), values.size)
    return np.convolve(values, np.ones(window) / window, mode='valid')
smoothed_losses = smooth_curve(losses, window_size=10)
# Every smoothed value averages a window of consecutive raw points, so there
# are fewer of them than raw points. Shift them right by that difference so
# each one sits at the last raw index it covers; starting at x=0 would draw
# the smoothed curve to the left of the raw curve.
offset = len(losses) - len(smoothed_losses)
smoothed_x = range(offset, offset + len(smoothed_losses))
plt.figure(figsize=(10, 5))
plt.plot(losses, label="Raw Loss", alpha=0.3)
plt.plot(smoothed_x, smoothed_losses, label="Smoothed Loss", linewidth=2, color='red')  # smoothed curve
plt.title("Loss Trend")
# Each point of `losses` is the mean over `plot_iter` training iterations,
# not over an epoch, so label the axis accordingly.
plt.xlabel("Iterations (x{})".format(plot_iter))
plt.ylabel("Loss")
plt.legend()
plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()
GitHub Link: https://github.com/DanbiAubrey/RNN
GitHub - DanbiAubrey/RNN: RNN for Identifying the Country of Origin from a Family Name
RNN for Identifying the Country of Origin from a Family Name - DanbiAubrey/RNN
github.com
아니그니까님의 블로그글 참조: https://dykm.tistory.com/39
[Machine Learning] PyTorch로 RNN(순환 신경망) 구현하기
본 블로그의 Machine Learning 카테고리에서는 주로 이미지를 처리하는 내용을 다루었다.각각의 이미지 데이터는 하나의 독립적인 데이터였다.하지만 이런 패러다임에 맞지 않는 데이터는 어떻게
dykm.tistory.com
'AI > NLP' 카테고리의 다른 글
RNN(Recurrent Neural Networks)란?- 순환 신경망 설명 (2) | 2025.02.21 |
---|