Goal
- Take a movie review as input and predict its sentiment: positive (1) or negative (0)
1. Install and import libraries
!pip install transformers
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
import random
import time
import datetime
2. Import BERT-related libraries
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
BertTokenizer : converts sentences into numbers (token IDs)
BertForSequenceClassification : the BERT model for sequence classification
AdamW : optimization algorithm
DataLoader : utility that splits the data into batches
3. Download and load the data
!git clone https://github.com/e9t/nsmc.git
train = pd.read_csv("nsmc/ratings_train.txt", sep='\t')
test = pd.read_csv("nsmc/ratings_test.txt", sep='\t')
4. Preprocess the data (convert to BERT format)
- Add [CLS] and [SEP] tokens to each sentence
sentences = train['document'].fillna(" ").tolist()
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
labels = train['label'].values
[CLS] marks the start of a sentence
[SEP] marks the end of a sentence
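A tiny illustration of what the list comprehension above produces for a single review (the example sentence is the one classified later in step 15):

raw = "연기는 별로지만 재미 하나는 끝내줌!"
wrapped = "[CLS] " + raw + " [SEP]"   # the string the BERT tokenizer will see
print(wrapped)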
5. Convert sentences to numbers with the BERT tokenizer
MAX_LEN = 128
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
BERT only understands numbers, so every sentence has to be converted to token IDs (see the short sketch after this checklist).
✔ tokenizer.tokenize(sent) → splits a sentence into tokens
✔ tokenizer.convert_tokens_to_ids(x) → converts tokens into vocabulary IDs
✔ pad_sequences() → pads/truncates every sentence to the same length (128)
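A minimal sketch of what step 5 does to a single sentence; the exact tokens and IDs depend on the bert-base-multilingual-cased vocabulary, so they are not shown as fixed values:

sample = "[CLS] 재미있어요 [SEP]"
tokens = tokenizer.tokenize(sample)               # list of WordPiece tokens, starting with '[CLS]'
ids = tokenizer.convert_tokens_to_ids(tokens)     # list of vocabulary indices
padded = pad_sequences([ids], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print(len(tokens), padded.shape)                  # a handful of tokens -> (1, 128)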
6. Create attention masks
attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]
✔ Tells BERT to ignore the padded positions (0) and attend only to positions that contain real tokens
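A minimal sketch with hypothetical token IDs showing how the mask comes out:

seq = [101, 9831, 25539, 102, 0, 0, 0, 0]   # hypothetical padded IDs; real ones come from the tokenizer
mask = [float(i > 0) for i in seq]
print(mask)                                  # [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]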
7. Split into training and validation data
# Use the same random_state in both calls so the mask split lines up with the input split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=42, test_size=0.1)
8. Convert the data to PyTorch tensors
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)
9. Create a DataLoader (split the data into batches)
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
The training data is grouped into batches of 32 for training (a quick shape check follows).
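A small sanity check (not part of the original steps) that pulls one batch from the DataLoader and confirms each batch holds 32 padded sequences of length 128:

b_input_ids, b_input_mask, b_labels = next(iter(train_dataloader))
print(b_input_ids.shape)    # expected: torch.Size([32, 128])
print(b_input_mask.shape)   # expected: torch.Size([32, 128])
print(b_labels.shape)       # expected: torch.Size([32])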
10. Create and configure the BERT model
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.to(device)  # works on both GPU and CPU, unlike model.cuda()
11. Set up the optimizer and learning-rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
12. Train the model
from tqdm import tqdm
for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i+1} / {epochs} ========")
    print("Training...")
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training Progress")):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()  # reset gradients from the previous step
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()
13. Save the model
torch.save(model, "saved_model.pt")  # save the whole model object (.pt is the usual PyTorch extension)
14. Load the saved model
def load_model():
    model = torch.load("saved_model.pt", map_location=device)
    model.to(device)
    return model
model = load_model()
15. Predict the sentiment of new sentences
def test_sentences(sentences):
    model.eval()
    inputs = [tokenizer.encode(sent, add_special_tokens=True, max_length=128, padding='max_length', truncation=True) for sent in sentences]
    input_ids = torch.tensor(inputs).to(device)
    with torch.no_grad():
        outputs = model(input_ids)
    logits = outputs[0].detach().cpu().numpy()
    return logits
logits = test_sentences(["연기는 별로지만 재미 하나는 끝내줌!"])
print(np.argmax(logits))
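The logits are raw scores for the two classes, so argmax picks the predicted class index. A small helper (an addition, not in the original code) that maps it back to the labels from the goal, 1 = positive and 0 = negative:

pred = int(np.argmax(logits))
print("positive (1)" if pred == 1 else "negative (0)")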
Full code
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
import random
import time
import datetime
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# Assumes the setup from steps 1 and 3 has already been run:
# !pip install transformers and !git clone https://github.com/e9t/nsmc.git
train = pd.read_csv("nsmc/ratings_train.txt", sep='\t')
test = pd.read_csv("nsmc/ratings_test.txt", sep='\t')
sentences = train['document'].fillna(" ").tolist()
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
labels = train['label'].values
MAX_LEN = 128
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]
# Use the same random_state in both calls so the mask split lines up with the input split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=42, test_size=0.1)
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.to(device)  # works on both GPU and CPU, unlike model.cuda()
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
from tqdm import tqdm
for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i+1} / {epochs} ========")
    print("Training...")
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training Progress")):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()  # reset gradients from the previous step
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()
torch.save(model, "saved_model.pt")
def load_model():
    model = torch.load("saved_model.pt", map_location=device)
    model.to(device)
    return model
model = load_model()
def test_sentences(sentences):
    model.eval()
    inputs = [tokenizer.encode(sent, add_special_tokens=True, max_length=128, padding='max_length', truncation=True) for sent in sentences]
    input_ids = torch.tensor(inputs).to(device)
    with torch.no_grad():
        outputs = model(input_ids)
    logits = outputs[0].detach().cpu().numpy()
    return logits
logits = test_sentences(["연기는 별로지만 재미 하나는 끝내줌!"])
print(np.argmax(logits))