
Text Data - Naver Movie Review Sentiment Analysis (BERT)

별은_StarIs_Dev 2025. 2. 9. 18:36

Goal

 - Take a movie review as input and predict whether its sentiment is positive (1) or negative (0)

 

1. Installing and Importing Libraries

!pip install transformers
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
import random
import time
import datetime

 

2. Importing the BERT-Related Libraries

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

BertTokenizer : converts sentences into numbers (token IDs)

BertForSequenceClassification : the BERT model with a classification head

AdamW : the optimization algorithm

DataLoader : splits the data into batches
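
As a quick illustration of what the tokenizer does, the following small sketch (using the same 'bert-base-multilingual-cased' checkpoint loaded later in this post, with a made-up example sentence) turns one sentence into subword tokens and then into vocabulary IDs:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
sample = "[CLS] 재미있는 영화였다 [SEP]"          # made-up example review
tokens = tokenizer.tokenize(sample)               # split into subword tokens
ids = tokenizer.convert_tokens_to_ids(tokens)     # map each token to its vocabulary index
print(tokens)
print(ids)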

 

3. Downloading and Loading the Data

!git clone https://github.com/e9t/nsmc.git
train = pd.read_csv("nsmc/ratings_train.txt", sep='\t')
test = pd.read_csv("nsmc/ratings_test.txt", sep='\t')
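
The NSMC files are tab-separated with three columns: id, document (the review text), and label (0 = negative, 1 = positive). A quick check confirms the layout:

print(train.shape)    # (150000, 3) - 150,000 training reviews
print(train.head())   # columns: id, document, label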

 

4. Data Preprocessing (Converting to BERT Format)

 - Add the [CLS] and [SEP] tokens to every sentence

sentences = train['document'].fillna(" ").tolist()
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
labels = train['label'].values

[CLS]: marks the start of the sentence

[SEP]: marks the end of the sentence
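
A quick sanity check that the formatting worked (the exact text depends on the first review in the dataset):

print(sentences[0])   # should look like "[CLS] <review text> [SEP]"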

 

5. Converting Sentences to Numbers with the BERT Tokenizer

MAX_LEN = 128
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

BERT only understands numbers, so every sentence has to be converted into token IDs.

✔ tokenizer.tokenize(sent) → splits a sentence into tokens
✔ tokenizer.convert_tokens_to_ids(x) → converts each token into a number (ID)
✔ pad_sequences() → pads every sentence to the same length (128)
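
To see the padding in action, here is a small sketch (with a made-up example sentence) showing that a short input is zero-padded up to MAX_LEN:

example = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("[CLS] 최고의 영화 [SEP]"))
padded = pad_sequences([example], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print(len(example))   # only a handful of token IDs
print(padded.shape)   # (1, 128) - the rest is filled with trailing zeros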

 

6. Creating the Attention Masks

attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]

The mask tells BERT to ignore the padded positions (0) and attend only to the positions that contain real tokens.
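
For example, the mask of the first review is 1.0 for every real token and 0.0 for every padded position:

print(input_ids[0])                  # token IDs followed by trailing 0s (padding)
print(attention_masks[0])            # 1.0 for real tokens, 0.0 for padding
print(int(sum(attention_masks[0])))  # number of real (non-padded) tokens in the first review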

 

7. Splitting into Training and Validation Data

# both splits must share the same random_state so the attention masks stay aligned with their input sequences
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=42, test_size=0.1)

 

8. Converting the Data to PyTorch Tensors

train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

 

9. Creating the DataLoader (Splitting Data into Batches)

batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

The data is grouped into batches of 32 for training.
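
A quick way to confirm the batch layout is to pull a single batch and inspect its shapes:

b_input_ids, b_input_mask, b_labels = next(iter(train_dataloader))
print(b_input_ids.shape)    # torch.Size([32, 128]) - a batch of token ID sequences
print(b_input_mask.shape)   # torch.Size([32, 128]) - the matching attention masks
print(b_labels.shape)       # torch.Size([32])      - one label per review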

 

10. Creating and Configuring the BERT Model

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.to(device)   # move the model to the GPU if one is available, otherwise keep it on the CPU

 

11. Setting Up the Optimizer and Learning Rate Scheduler

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)   # AdamW with a small learning rate, as is typical for BERT fine-tuning
epochs = 4
total_steps = len(train_dataloader) * epochs               # batches per epoch x number of epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
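
With num_warmup_steps=0 the scheduler simply decays the learning rate linearly from 2e-5 down to 0 over total_steps. A tiny sketch with a toy parameter (hypothetical, only to print the schedule) makes this visible:

toy_param = [torch.nn.Parameter(torch.zeros(1))]
toy_opt = torch.optim.AdamW(toy_param, lr=2e-5)
toy_sched = get_linear_schedule_with_warmup(toy_opt, num_warmup_steps=0, num_training_steps=5)
for step in range(5):
    toy_opt.step()
    toy_sched.step()
    print(step, toy_sched.get_last_lr())   # the learning rate shrinks toward 0 at every step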

 

12. Training the Model

from tqdm import tqdm

for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i+1} / {epochs} ========")
    print("Training...")
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training Progress")):
        batch = tuple(t.to(device) for t in batch)   # move the batch to the same device as the model
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()                        # reset gradients accumulated in the previous step
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]                            # when labels are passed in, the first output is the loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   # clip gradients to keep fine-tuning stable
        optimizer.step()
        scheduler.step()
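
The validation tensors built in step 8 are not used in the loop above; a minimal evaluation sketch that measures accuracy on the held-out 10% could look like this:

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, sampler=SequentialSampler(validation_data), batch_size=batch_size)

model.eval()
correct, total = 0, 0
for batch in validation_dataloader:
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
    with torch.no_grad():
        logits = model(b_input_ids, attention_mask=b_input_mask)[0]
    preds = logits.argmax(dim=1)                    # predicted class per review
    correct += (preds == b_labels).sum().item()
    total += b_labels.size(0)
print("Validation accuracy:", correct / total)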

 

13. Saving the Model

torch.save(model, "saved_model.pt")   # .pt/.pth is the usual extension for PyTorch checkpoints (.h5 is a Keras convention)

 

14. Loading the Saved Model

def load_model():
    model = torch.load("saved_model.pt")   # on recent PyTorch versions loading a full model object may require weights_only=False
    model.to(device)
    return model

model = load_model()
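
Saving the whole model object ties the checkpoint to this exact class definition; a common alternative is to save only the weights via state_dict (the file name below is just an example):

torch.save(model.state_dict(), "saved_model_state.pt")

model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.load_state_dict(torch.load("saved_model_state.pt"))
model.to(device)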

 

15. Predicting the Sentiment of New Sentences (Inference)

def test_sentences(sentences):
    model.eval()
    inputs = [tokenizer.encode(sent, add_special_tokens=True, max_length=128, padding='max_length', truncation=True) for sent in sentences]
    input_ids = torch.tensor(inputs).to(device)
    attention_mask = (input_ids != tokenizer.pad_token_id).long()   # ignore padded positions, just as during training
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs[0].detach().cpu().numpy()
    return logits

logits = test_sentences(["연기는 별로지만 재미 하나는 끝내줌!"])
print(np.argmax(logits))   # 1 = positive, 0 = negative
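
If a confidence score is needed, the raw logits can be turned into probabilities with a softmax:

probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)   # softmax over the two classes
print(probs)   # e.g. [[P(negative), P(positive)]]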

 

 

 

 

Full Code

 

import tensorflow as tf
import torch
import pandas as pd
import numpy as np
import random
import time
import datetime

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

!git clone https://github.com/e9t/nsmc.git
train = pd.read_csv("nsmc/ratings_train.txt", sep='\t')
test = pd.read_csv("nsmc/ratings_test.txt", sep='\t')

sentences = train['document'].fillna(" ").tolist()
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
labels = train['label'].values

MAX_LEN = 128
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=42, test_size=0.1)

train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

from tqdm import tqdm

for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i+1} / {epochs} ========")
    print("Training...")
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training Progress")):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

torch.save(model, "saved_model.pt")

def load_model():
    model = torch.load("saved_model.pt")
    model.to(device)
    return model

model = load_model()

def test_sentences(sentences):
    model.eval()
    inputs = [tokenizer.encode(sent, add_special_tokens=True, max_length=128, padding='max_length', truncation=True) for sent in sentences]
    input_ids = torch.tensor(inputs).to(device)
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs[0].detach().cpu().numpy()
    return logits

logits = test_sentences(["연기는 별로지만 재미 하나는 끝내줌!"])
print(np.argmax(logits))