
Text Data - Sentence Emotion Classification (BERT)

별은_StarIs_Dev 2025. 2. 9. 17:04

Goal

 - Classify the emotion expressed in each sentence

 - Text classification with BERT

 

1. Installing and Importing the Required Libraries

 - Main libraries

!pip install transformers scikit-learn pandas numpy torch keras
# transformers : provides the BERT model and tokenizer
# scikit-learn : data preprocessing and model evaluation
# pandas, numpy : data handling and numerical operations
# torch : training the deep learning model
# keras (pad_sequences) : padding the batched input sequences

 

import pandas as pd
import numpy as np
import torch
import random
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW  # recent transformers versions no longer export AdamW; torch's AdamW works with the same call below
from keras.preprocessing.sequence import pad_sequences  # on newer Keras this lives at keras.utils.pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

 

2. Loading and Preprocessing the Data

 - Load the CSV files

train = pd.read_csv('./corpus2_train.csv', encoding='UTF-8')
test30 = pd.read_csv('./corpus2_test100.csv', encoding='UTF-8')

 

 - Rename the columns

train = train.rename(columns={'감정':'label', '문장':'text'})
test30 = test30.rename(columns={'문장':'text'})

 

 - Convert the emotion labels to integers (a quick illustration follows the code below)

encoder = LabelEncoder()
train['label'] = encoder.fit_transform(train['label'])
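 
For reference, here is a minimal sketch of what LabelEncoder does, using made-up emotion labels (the actual class names depend on your CSV):

# Minimal LabelEncoder illustration with hypothetical emotion labels
from sklearn.preprocessing import LabelEncoder

demo_encoder = LabelEncoder()
demo_labels = ["기쁨", "슬픔", "분노", "슬픔"]            # made-up examples, not the real dataset classes
encoded = demo_encoder.fit_transform(demo_labels)         # integer codes assigned by sorted class order
print(demo_encoder.classes_)                              # the sorted unique class names
print(demo_encoder.inverse_transform(encoded))            # maps the codes back to the original strings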

 

 - Drop rows with missing values

train.dropna(subset=['text','label'], inplace=True)
test30.dropna(subset=['text'], inplace=True)

 

 - Wrap each sentence with the [CLS]/[SEP] markers BERT expects

train_sentences = ["[CLS] " + str(s) + " [SEP]" for s in train['text']]
test30_sentences = ["[CLS] " + str(s) + " [SEP]" for s in test30['text']]

 

 

3. Converting the Data with the BERT Tokenizer

 - Configure the BERT model name and tokenizer

bert_model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=False)
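 
To get a feel for what the multilingual WordPiece tokenizer produces, a quick check on one sample sentence (the exact subword split depends on the vocabulary) can look like this:

# Quick sanity check: tokenize one sample sentence and map the tokens to vocabulary ids
sample = "[CLS] 오늘은 기분이 좋아요 [SEP]"
tokens = tokenizer.tokenize(sample)              # WordPiece subword tokens, e.g. ['[CLS]', '오늘', ...]
ids = tokenizer.convert_tokens_to_ids(tokens)    # corresponding vocabulary indices
print(tokens)
print(ids)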

 

 

 - Function that converts sentences into BERT input tensors

MAX_LEN = 128
batch_size = 32

def convert_data(sentences, labels, tokenizer, max_len=128):
    # WordPiece-tokenize each sentence and map the tokens to vocabulary ids
    tokenized_texts = [tokenizer.tokenize(s) for s in sentences]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    # Pad or truncate every sequence to max_len
    input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")

    # Attention mask: 1.0 for real tokens, 0.0 for padding
    attention_masks = [[float(i>0) for i in seq] for seq in input_ids]

    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)

    return inputs, masks, labels

 

 

 - Run the conversion

train_inputs, train_masks, train_labels = convert_data(train_sentences, train['label'].values, tokenizer, MAX_LEN)
test_inputs, test_masks, test_labels = convert_data(test30_sentences, [0] * len(test30_sentences), tokenizer, MAX_LEN)  # dummy labels; the test set has no ground truth
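 
Before building the DataLoader, a quick shape check (a sketch; the row count depends on your CSV) confirms the tensors line up:

# Sanity check: inputs, masks, and labels must share the same first (sample) dimension
print(train_inputs.shape)    # (num_train_samples, MAX_LEN)
print(train_masks.shape)     # (num_train_samples, MAX_LEN)
print(train_labels.shape)    # (num_train_samples,)
assert train_inputs.shape[0] == train_masks.shape[0] == train_labels.shape[0]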

 

4. Creating the DataLoader

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
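 
Pulling a single batch from the loader is an easy way to verify the batch dimensions, for example:

# Inspect one batch; elements follow the TensorDataset order (inputs, masks, labels)
b_inputs, b_masks, b_labels = next(iter(train_dataloader))
print(b_inputs.shape)    # (batch_size, MAX_LEN); the last batch may be smaller
print(b_labels[:5])      # first few integer labels in the batch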

 

5. Initializing the BERT Model and Configuring Training

 - Load the pretrained model

num_labels = len(encoder.classes_)
model = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_labels)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

model.to(device)

 

 - Set up the optimizer and learning-rate scheduler

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)
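 
The schedule above uses no warmup steps. A common variant (shown only as an assumption, not what this post uses) warms the learning rate up over roughly the first 10% of steps:

# Hypothetical variant: linear schedule with 10% warmup instead of num_warmup_steps=0
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),   # ramp the learning rate up over the first 10% of steps
    num_training_steps=total_steps,
)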

 

 

6. Training the Model

from tqdm import tqdm

for epoch_i in range(3):
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training Progress")):
        b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
        model.zero_grad()
        # Passing labels makes the model compute the cross-entropy loss internally
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
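 
The imports above include train_test_split and classification_report, but this post trains on the full CSV without a validation split. If you do hold data out, a hedged sketch of an evaluation loop could look like the following (val_dataloader is assumed to be built the same way as train_dataloader from the held-out tensors):

# Sketch: evaluate on a held-out validation DataLoader (val_dataloader is hypothetical here)
def evaluate(model, dataloader):
    model.eval()
    all_preds, all_labels = [], []
    for batch in dataloader:
        b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
        all_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
        all_labels.extend(b_labels.cpu().numpy())
    # target_names assumes every class appears in the validation split
    print(classification_report(all_labels, all_preds, target_names=encoder.classes_))

# evaluate(model, val_dataloader)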

 

7. Saving the Model

torch.save(model, "saved_model.pt")  # pickles the whole model object; .pt is the usual PyTorch extension (this file is not HDF5)

 

8. Loading the Model and Making Predictions

 - Load the saved model

model_loaded = torch.load("saved_model.pt", map_location=device, weights_only=False)  # weights_only=False is needed on recent PyTorch to unpickle a full model object
model_loaded.to(device)
print("Model loaded from saved_model.pt")

 

def predict_sentences(model, tokenizer, sentences, max_len=128):
    # Encode each sentence, run a forward pass without gradients, and return the argmax class index
    model.eval()
    input_ids = []
    attention_masks = []
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0).to(device)
    attention_masks = torch.cat(attention_masks, dim=0).to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
    logits = outputs.logits.detach().cpu().numpy()
    return np.argmax(logits, axis=1)

 

 - Predict on the test data

test30_sentences_only = test30['text'].tolist()
test30_pred_loaded = predict_sentences(model_loaded, tokenizer, test30_sentences_only)
test30['pred_label_loaded'] = encoder.inverse_transform(test30_pred_loaded)

 

 - Save the predictions to a CSV file

test30.to_csv("test30_pred_result.csv", index=False)
print("Test predictions saved to test30_pred_result.csv")

 

 - Example: predicting a new sentence

test_sentence = "무릎이 아파서 잠시 걷는 것조차 힘들어졌어. 내가 앞으로 무얼 할 수 있을까?"
pred_loaded = predict_sentences(model_loaded, tokenizer, [test_sentence])
print("\nExample Prediction with Loaded Model:", test_sentence)
print("Predicted Label:", encoder.inverse_transform(pred_loaded)[0])

 

Full Code

import pandas as pd
import numpy as np
import torch
import random
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW  # recent transformers versions no longer export AdamW; torch's AdamW works with the same call below
from keras.preprocessing.sequence import pad_sequences  # on newer Keras this lives at keras.utils.pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

train = pd.read_csv('./data_folder/corpus2_train.csv', encoding='UTF-8')
test30 = pd.read_csv('./data_folder/corpus2_test100.csv', encoding='UTF-8')

train = train.rename(columns={'감정':'label', '문장':'text'})
test30 = test30.rename(columns={'문장':'text'})

encoder = LabelEncoder()
train['label'] = encoder.fit_transform(train['label'])

train.dropna(subset=['text','label'], inplace=True)
test30.dropna(subset=['text'], inplace=True)

train_sentences = ["[CLS] " + str(s) + " [SEP]" for s in train['text']]
test30_sentences = ["[CLS] " + str(s) + " [SEP]" for s in test30['text']]

bert_model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=False)

MAX_LEN = 128
batch_size = 32

def convert_data(sentences, labels, tokenizer, max_len=128):
    tokenized_texts = [tokenizer.tokenize(s) for s in sentences]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")

    attention_masks = [[float(i>0) for i in seq] for seq in input_ids]

    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)

    return inputs, masks, labels

train_inputs, train_masks, train_labels = convert_data(train_sentences, train['label'].values, tokenizer, MAX_LEN)
test_inputs, test_masks, test_labels = convert_data(test30_sentences, [0] * len(test30_sentences), tokenizer, MAX_LEN)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

num_labels = len(encoder.classes_)
model = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_labels)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)

from tqdm import tqdm
for epoch_i in range(3):
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training Progress")):
        b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

torch.save(model, "saved_model.pt")

model_loaded = torch.load("saved_model.pt", map_location=device, weights_only=False)
model_loaded.to(device)
print("Model loaded from saved_model.pt")

def predict_sentences(model, tokenizer, sentences, max_len=128):
    model.eval()
    input_ids = []
    attention_masks = []
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0).to(device)
    attention_masks = torch.cat(attention_masks, dim=0).to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
    logits = outputs.logits.detach().cpu().numpy()
    return np.argmax(logits, axis=1)

test30_sentences_only = test30['text'].tolist()
test30_pred_loaded = predict_sentences(model_loaded, tokenizer, test30_sentences_only)
test30['pred_label_loaded'] = encoder.inverse_transform(test30_pred_loaded)

test30.to_csv("test30_pred_result.csv", index=False)
print("Test predictions saved to test30_pred_result.csv")

test_sentence = "무릎이 아파서 잠시 걷는 것조차 힘들어졌어. 내가 앞으로 무얼 할 수 있을까?"
pred_loaded = predict_sentences(model_loaded, tokenizer, [test_sentence])
print("\nExample Prediction with Loaded Model:", test_sentence)
print("Predicted Label:", encoder.inverse_transform(pred_loaded)[0])