Beomi/KcELECTRA

커스텀 데이터를 이용해 predict하는 방법이 궁금합니다.

Haebuk opened this issue · 1 comments

안녕하세요.
현재 감성분석 task를 진행하기 위해 pretrained 모델을 불러와 nsmc(네이버 영화 리뷰) 데이터로 학습을 시킨 상태입니다.
다른 리뷰 데이터를 넣어서 predict값을 뽑아주고 싶은데 어떻게 하면 될까요?
현재 val acc가 가장 높은 가중치를 best.ckpt로 저장한 상태입니다.


import os
import pandas as pd
import yaml

from pprint import pprint
import wandb
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import ExponentialLR, CosineAnnealingWarmRestarts

from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import re
import emoji
from soynlp.normalizer import repeat_normalize
from utils.load_yaml import load_yaml

# Shared hyper-parameter/config file, loaded once at import time.
CONFIG_PATH = 'KcELECTRA_config.yaml'
default_config = load_yaml(CONFIG_PATH)
print(f'default_config: {default_config}')
class Model(LightningModule):
    """Binary sentiment classifier (NSMC) built on a pretrained KcELECTRA.

    Wraps a HuggingFace ``AutoModelForSequenceClassification`` plus its
    tokenizer. Static settings come from the module-level ``default_config``
    (yaml); run-time hyper-parameters (lr, optimizer, batch size, ...) are
    read from ``wandb.config``.
    """

    def __init__(self, **kwargs):
        super().__init__()
        # Backbone + sequence-classification head from the configured checkpoint.
        self.clsfier = AutoModelForSequenceClassification.from_pretrained(default_config['pretrained_model'])
        # Tokenizer may live in a separate repo; fall back to the model repo.
        self.tokenizer = AutoTokenizer.from_pretrained(
            default_config['pretrained_tokenizer']
            if default_config['pretrained_tokenizer']
            else default_config['pretrained_model']
        )
        # Compile the text-cleaning regexes ONCE here instead of rebuilding the
        # emoji alphabet and recompiling both patterns on every clean() call.
        # NOTE(review): emoji.UNICODE_EMOJI was removed in emoji>=2.0
        # (replaced by emoji.EMOJI_DATA) -- confirm the pinned emoji version.
        emojis = ''.join(emoji.UNICODE_EMOJI.keys())
        self._clean_pattern = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{emojis}]+')
        self._url_pattern = re.compile(
            r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

    def forward(self, **kwargs):
        """Delegate directly to the underlying HuggingFace classifier."""
        return self.clsfier(**kwargs)

    def step(self, batch, batch_idx):
        """Shared train/val step: forward pass, loss, and label predictions.

        Returns a dict with the loss tensor plus python lists of true and
        predicted labels for epoch-level metric aggregation.
        """
        data, labels = batch
        # Passing `labels` makes the HF model compute the loss internally.
        output = self(input_ids=data, labels=labels)

        # Transformers 4.0.0+ returns a ModelOutput with named fields.
        loss = output.loss
        logits = output.logits

        preds = logits.argmax(dim=-1)

        y_true = list(labels.cpu().numpy())
        y_pred = list(preds.cpu().numpy())

        return {
            'loss': loss,
            'y_true': y_true,
            'y_pred': y_pred,
        }

    def training_step(self, batch, batch_idx):
        return self.step(batch, batch_idx)

    def validation_step(self, batch, batch_idx):
        return self.step(batch, batch_idx)

    def epoch_end(self, outputs, state='train'):
        """Aggregate per-step outputs into epoch metrics and log them.

        Args:
            outputs: list of dicts returned by ``step``.
            state: metric-name prefix, ``'train'`` or ``'val'``.
        """
        loss = torch.tensor(0, dtype=torch.float)
        for i in outputs:
            loss += i['loss'].cpu().detach()
        loss = loss / len(outputs)

        y_true = []
        y_pred = []
        for i in outputs:
            y_true += i['y_true']
            y_pred += i['y_pred']

        # sklearn defaults to binary averaging, which matches the two-class
        # NSMC sentiment labels.
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        rec = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        self.log(state+'_loss', float(loss), on_epoch=True, prog_bar=True)
        self.log(state+'_acc', acc, on_epoch=True, prog_bar=True)
        self.log(state+'_precision', prec, on_epoch=True, prog_bar=True)
        self.log(state+'_recall', rec, on_epoch=True, prog_bar=True)
        self.log(state+'_f1', f1, on_epoch=True, prog_bar=True)
        print(f'[Epoch {self.trainer.current_epoch} {state.upper()}] Loss: {loss}, Acc: {acc}, Prec: {prec}, Rec: {rec}, F1: {f1}')
        return {'loss': loss}

    def training_epoch_end(self, outputs):
        self.epoch_end(outputs, state='train')

    def validation_epoch_end(self, outputs):
        self.epoch_end(outputs, state='val')

    def configure_optimizers(self):
        """Build the optimizer and LR scheduler selected in ``wandb.config``.

        Raises:
            NotImplementedError: for unsupported optimizer/scheduler names.
        """
        if wandb.config['optimizer'] == 'AdamW':
            optimizer = AdamW(self.parameters(), lr=float(wandb.config['lr']))
        elif wandb.config['optimizer'] == 'AdamP':
            from adamp import AdamP
            optimizer = AdamP(self.parameters(), lr=float(wandb.config['lr']))
        else:
            raise NotImplementedError('Only AdamW and AdamP is Supported!')
        if wandb.config['lr_scheduler'] == 'cos':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
        elif wandb.config['lr_scheduler'] == 'exp':
            scheduler = ExponentialLR(optimizer, gamma=0.5)
        else:
            raise NotImplementedError('Only cos and exp lr scheduler is Supported!')
        # BUG FIX: Lightning recognizes the key 'lr_scheduler', not 'scheduler'.
        # With the old key the scheduler was silently never stepped.
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
        }

    def read_data(self, path):
        """Load a dataframe from xlsx / csv / tsv(txt) by file extension."""
        if path.endswith('xlsx'):
            return pd.read_excel(path)
        elif path.endswith('csv'):
            return pd.read_csv(path)
        elif path.endswith('tsv') or path.endswith('txt'):
            return pd.read_csv(path, sep='\t')
        else:
            raise NotImplementedError('Only Excel(xlsx)/Csv/Tsv(txt) are Supported')

    def clean(self, x):
        """Normalize raw review text: strip disallowed chars, URLs, and
        collapse character repetitions (e.g. 'ㅋㅋㅋㅋ' -> 'ㅋㅋ')."""
        # Patterns are precompiled in __init__ (hot path: called per row).
        x = self._clean_pattern.sub(' ', x)
        x = self._url_pattern.sub('', x)
        x = x.strip()
        x = repeat_normalize(x, num_repeats=2)
        return x

    def encode(self, x, **kwargs):
        """Clean then tokenize one text into fixed-length input ids."""
        return self.tokenizer.encode(
            self.clean(str(x)),
            padding='max_length',
            max_length=int(wandb.config['max_length']),
            truncation=True,
            **kwargs,
        )

    def preprocess_dataframe(self, df):
        """Replace the 'document' column with encoded token-id lists in place."""
        df['document'] = df['document'].map(self.encode)
        return df

    def dataloader(self, path, shuffle=False):
        """Build a DataLoader of (input_ids, label) tensors from a data file."""
        df = self.read_data(path)
        df = self.preprocess_dataframe(df)

        dataset = TensorDataset(
            torch.tensor(df['document'].to_list(), dtype=torch.long),
            torch.tensor(df['label'].to_list(), dtype=torch.long),
        )
        # BUG FIX: the original ternary parsed as
        # `(batch_size * 1) if not tpu_cores else tpu_cores`, so on TPU the
        # batch size silently became the TPU core COUNT. Presumably the intent
        # was to scale the batch size by the core count -- TODO confirm.
        batch_size = int(wandb.config['batch_size'])
        if default_config['tpu_cores']:
            batch_size *= default_config['tpu_cores']
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=os.cpu_count(),
        )

    def train_dataloader(self):
        return self.dataloader(default_config['train_data_path'], shuffle=True)

    def val_dataloader(self):
        return self.dataloader(default_config['val_data_path'], shuffle=False)

model = Model.load_from_checkpoint('path/to/best.ckpt')  # placeholder: path to the saved checkpoint

까지 한 상태인데, model에 데이터를 넣어서 예측 값을 뽑는 방법이 궁금합니다.

Beomi commented
model(**model.tokenizer('안녕하세요', return_tensors='pt'))

위와 같은 방식으로 Inference 진행이 가능합니다 :)