[Dacon] 소설 작가 분류 AI 경진대회 정리[01.19 ~ 01.20]

개인 프로젝트/DACON

[Dacon] 소설 작가 분류 AI 경진대회 정리[01.19 ~ 01.20]

Jerry Jun 2021. 1. 21. 13:20

대회명 : 소설 작가 분류 AI 경진대회
기 간 : 2020.10.29 ~ 2020.12.04
주 제 : 문체 분석 알고리즘 개발
설 명 : 소설 속 문장뭉치 분석을 통한 저자 예측
주 관 : DACON

EDA

본 EDA 과정은 Google Colaboratory 를 사용하였습니다.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Folder that holds the competition CSV files (a Google Drive path as seen
# from the Colab runtime).
path = '/content/drive/MyDrive/dacon/author/'
# Both files are read with their first column as the row index.
train, test = (
    pd.read_csv(path + csv_name, index_col=0)
    for csv_name in ('train.csv', 'test_x.csv')
)

필요한 모듈 선언 및 데이터 불러오기

# Check the sizes (rows, columns) of the train data and the test data
train.shape, test.shape
---------------------------------
((54879, 2), (19617, 1))

불러온 데이터의 크기 확인

import nltk
# Download the NLTK 'stopwords' corpus.
nltk.download('stopwords')

stopwords 불러오기

def plot_top_stopwords_barchart(text):
    """Plot the ten most frequent English stopwords found in ``text``.

    Every entry of ``text`` is split on whitespace and each token that is
    an exact, case-sensitive match for an NLTK English stopword is counted.
    The ten highest counts are drawn with ``plt.bar``.

    Args:
        text: pandas Series of strings (must support ``.str.split()``).
            Entries that cannot be split (e.g. NaN) are ignored.

    Returns:
        A ``(words, counts)`` pair of tuples ordered from most to least
        frequent. Two empty tuples are returned, and nothing is plotted,
        when no stopword occurs in ``text``.
    """
    # Imported here so the function is self-contained.
    from collections import Counter
    from nltk.corpus import stopwords

    # Keep the set under its own name: assigning to ``stopwords`` inside the
    # function would turn that name into a local and hide the NLTK corpus.
    stop = set(stopwords.words('english'))

    counts = Counter(
        word
        for words in text.str.split().dropna()
        for word in words
        if word in stop
    )
    top = counts.most_common(10)
    if not top:
        # zip(*[]) yields nothing, so unpacking into x, y would fail.
        return (), ()

    x, y = zip(*top)
    plt.bar(x, y)

    return x, y

불용어 중 가장 많은 빈도수를 가지는 상위 10개의 단어를 찾아내 막대그래프로 표현하는 함수 생성.

잠깐! 여기서 쓰이는 함수 zip의 기능은 무엇일까?

zip 은 같은 위치에 있는 아이템별로 그룹핑하여 iterator 형태로 반환해주는 함수입니다.

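이 함수에서는 zip(*top) 처럼 "*" 기호를 함께 썼는데, 이는 리스트의 원소들을 풀어서(unpacking) zip 의 인자로 넘겨주기 때문에 unzip 의 기능을 합니다. 예를 들어 zip(*[('the', 3), ('and', 2)]) 는 ('the', 'and') 와 (3, 2) 를 차례로 반환합니다.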

# Plot the ten most frequent stopwords over the whole training text.
plot_top_stopwords_barchart(train['text'])

그래프에서 볼 수 있듯이 train 데이터에는 'the' > 'and' > 'to' 순으로 불용어가 많이 등장합니다.

불용어는 뭔가 모두 삭제해줘야 할 것 같은 느낌이지만 작가마다 자주 쓰는 불용어가 있을 수 있습니다. 각각 작가마다 불용어를 쓰는 빈도를 알아보겠습니다.

def plot_top_stopwords_barchart2(text, stop=None):
    """Return the ten most frequent stopwords in ``text`` without plotting.

    Every entry of ``text`` is split on whitespace and each token that is
    an exact, case-sensitive member of ``stop`` is counted.

    Args:
        text: pandas Series of strings (must support ``.str.split()``).
            Entries that cannot be split (e.g. NaN) are ignored.
        stop: optional collection of stopwords to count. When omitted, the
            NLTK English stopword list is used.

    Returns:
        A ``(words, counts)`` pair of tuples ordered from most to least
        frequent, or two empty tuples when no stopword occurs in ``text``.
    """
    # Imported here so the function is self-contained.
    from collections import Counter

    if stop is None:
        from nltk.corpus import stopwords
        stop = set(stopwords.words('english'))

    counts = Counter(
        word
        for words in text.str.split().dropna()
        for word in words
        if word in stop
    )
    top = counts.most_common(10)
    if not top:
        # zip(*[]) yields nothing, so unpacking into x, y would fail.
        return (), ()

    x, y = zip(*top)
    return x, y

# One figure with a rows x cols (2 x 3) grid of subplots.
fig = plt.figure(figsize=(20, 10))
rows, cols = 2, 3

for i in range(5):
    # Top stopwords among the texts whose 'author' value is i.
    x, y = plot_top_stopwords_barchart2(train[train['author'] == i]['text'])
    # Create the (i + 1)-th subplot of the rows x cols grid.
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title("author" + str(i))
    ax.bar(x, y)

0 : the > and > to > of > a > in > was > that > his > he
1 : the > to > of > and > a > was > in > her > not > be
2 : the > of > and > to > a > in > that > was > his > he
3 : the > and > to > of > a > in > he > you > that > was
4 : the > and > of > a > to > in > was > he > his > with

작가마다 자주 쓰는 불용어는 대부분 비슷하다는 것을 알 수 있습니다.

from collections import  Counter

# Grid dimensions and the figure that will host the subplots.
rows, cols = 2, 3
fig = plt.figure(figsize=(17, 8))

def plot_top_non_stopwords_barchart2(text, stop=None, top_k=40):
    """Return the non-stopword tokens among the most frequent tokens.

    Every entry of ``text`` is split on whitespace and all tokens are
    counted. Only the ``top_k`` most frequent tokens are then examined, and
    those that are not in ``stop`` are returned. Fewer than ``top_k`` words
    come back whenever stopwords occupy part of that window. Matching is
    exact and case-sensitive.

    Args:
        text: pandas Series of strings (must support ``.str.split()``).
            Entries that cannot be split (e.g. NaN) are ignored.
        stop: optional collection of words to exclude. When omitted, the
            NLTK English stopword list is used.
        top_k: how many of the most frequent tokens to examine; ``None``
            examines the whole vocabulary.

    Returns:
        ``(words, counts)`` as two lists ordered from most to least
        frequent; both are empty when nothing qualifies.
    """
    if stop is None:
        from nltk.corpus import stopwords
        stop = set(stopwords.words('english'))

    counter = Counter(
        word
        for words in text.str.split().dropna()
        for word in words
    )

    x, y = [], []
    # most_common(top_k) selects only the needed entries instead of sorting
    # the full vocabulary and slicing it afterwards.
    for word, count in counter.most_common(top_k):
        if word not in stop:
            x.append(word)
            y.append(count)
    return x, y

for i in range(5):
    # Most frequent non-stopword tokens of author i (original casing).
    x, y = plot_top_non_stopwords_barchart2(train[train['author'] == i]['text'])
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title("author" + str(i))
    # Target this subplot explicitly instead of relying on pyplot's implicit
    # "current axes" state.
    sns.barplot(x=x, y=y, ax=ax)

# Add a lower-cased copy of the 'text' column to both frames as 'text_l'.
train['text_l'] = train['text'].str.lower() 
test['text_l'] = test['text'].str.lower()

소문자 텍스트만 있는 'text_l' 컬럼 생성

# Grid dimensions and the figure that will host the subplots.
rows, cols = 2, 3
fig = plt.figure(figsize=(17, 10))


def plot_top_non_stopwords_barchart3(text, stop=None, top_k=70):
    """Return the non-stopword tokens among the most frequent tokens.

    Every entry of ``text`` is split on whitespace and all tokens are
    counted. Only the ``top_k`` most frequent tokens are then examined, and
    those that are not in ``stop`` are returned. Fewer than ``top_k`` words
    come back whenever stopwords occupy part of that window. Matching is
    exact and case-sensitive.

    Args:
        text: pandas Series of strings (must support ``.str.split()``).
            Entries that cannot be split (e.g. NaN) are ignored.
        stop: optional collection of words to exclude. When omitted, the
            NLTK English stopword list is used.
        top_k: how many of the most frequent tokens to examine; ``None``
            examines the whole vocabulary.

    Returns:
        ``(words, counts)`` as two lists ordered from most to least
        frequent; both are empty when nothing qualifies.
    """
    if stop is None:
        from nltk.corpus import stopwords
        stop = set(stopwords.words('english'))

    counter = Counter(
        word
        for words in text.str.split().dropna()
        for word in words
    )

    x, y = [], []
    # most_common(top_k) selects only the needed entries instead of sorting
    # the full vocabulary and slicing it afterwards.
    for word, count in counter.most_common(top_k):
        if word not in stop:
            x.append(word)
            y.append(count)
    return x, y

for i in range(5):
    # Most frequent non-stopword tokens of author i, on the lower-cased text.
    x, y = plot_top_non_stopwords_barchart3(train[train['author'] == i]['text_l'])
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title("author" + str(i))
    # Counts go on the x axis and words on the y axis (horizontal bars).
    # Target this subplot explicitly instead of relying on pyplot's implicit
    # "current axes" state.
    sns.barplot(x=y, y=x, ax=ax)

데이터 안에 있는 텍스트를 모두 소문자로 변환 후 불용어를 제외한 빈도를 확인하니 공통적으로 "odin", "said"라는 단어가 상위에 있다는 점을 확인할 수 있었습니다. 작가를 예측하는데 객관성을 얻기 힘든 단어라 판단하여 불용어 단어 추가를 이용해 제거합니다.

# `stop` is otherwise only created as a local variable inside the helper
# functions, so it has to be built here before it can be extended.
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))
# Extra tokens to ignore, including the punctuation-attached variants that
# whitespace splitting produces.
# NOTE(review): the plotting helpers build their own stopword set on each
# call, so extending this set does not by itself change what they filter.
stop.update(('odin', 'odin,', 'odin.', 'said'))

0 : mr. > would > one > little > upon > could
1 : could > would > mr. > mrs. > must > miss
2 : upon > one > would > could > "i > man
3 : one > would > though > could > know > like
4 : upon > one > would > like > man > "i

# Grid dimensions and the figure in which the i-th subplot of the
# rows x cols grid will be created.
rows, cols = 2, 3
fig = plt.figure(figsize=(16, 10))

def plot_top_ngrams_barchart2(text, n=2):
    """Return the ten most frequent word n-grams in ``text``.

    Despite its name the function does not draw anything; it only returns
    the data for a bar chart.

    Args:
        text: iterable of strings (e.g. a pandas Series); each entry is
            treated as one document.
        n: n-gram length; only n-grams of exactly ``n`` tokens are counted.

    Returns:
        ``(ngrams, counts)`` as two lists ordered from most to least
        frequent, with at most ten entries each.
    """
    # NOTE(review): ``CountVectorizer`` is not imported in the visible part
    # of this file; presumably it is scikit-learn's
    # ``sklearn.feature_extraction.text.CountVectorizer`` - confirm that it
    # is in scope before calling.
    vec = CountVectorizer(ngram_range=(n, n)).fit(text)
    bag_of_words = vec.transform(text)
    # Summing over axis 0 adds up all documents (rows), leaving one total
    # per n-gram (column).
    sum_words = bag_of_words.sum(axis=0)
    # vocabulary_ maps each n-gram to its column index.
    words_freq = [
        (word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)

    x, y = map(list, zip(*words_freq[:10]))
    return x, y

for i in range(5):
    # Ten most frequent trigrams (n=3) of author i, on the lower-cased text.
    x, y = plot_top_ngrams_barchart2(train[train['author'] == i]['text_l'], n=3)
    ax = fig.add_subplot(rows, cols, i + 1)
    # Same "author<i>" title format as the other per-author figures.
    ax.set_title("author" + str(i))
    # Counts go on the x axis and n-grams on the y axis (horizontal bars).
    # Target this subplot explicitly instead of relying on pyplot's implicit
    # "current axes" state.
    sns.barplot(x=y, y=x, ax=ax)

n-gram 을 이용해서 연속적인 토큰을 통해 빈도를 알아보았습니다.

하지만 새로운 코드들이 많이 출현한 함수이기 때문에 코드 리뷰를 진행합니다.

# CountVectorizer( )의 기능

문장을 토큰 리스트로 변환
토큰의 빈도 카운트
BOW 인코딩 벡터로 변환

BOW 인코딩 벡터란?

Bag of Words 의 약자로 단어의 순서를 고려하지 않고 빈도만 수치화 하는 표현 방법입니다.

vec = CountVectorizer(ngram_range=(3, 3)).fit(text)

ngram_range=(3, 3) 으로 설정하면 n-gram 의 최소 길이와 최대 길이가 모두 3 이 되기 때문에, 연속된 3개의 토큰으로 이루어진 구(trigram)만 만들어지게 됩니다. text 를 fit 하고 나면 vec.vocabulary_ 안에는 각 trigram 과 그에 해당하는 위치(열 인덱스)가 딕셔너리 형식으로 들어가게 됩니다.

bag_of_words = vec.transform(corpus)

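bag_of_words 안에는 각 문장(행)에서 각 trigram(열)이 등장한 횟수가 희소 행렬(sparse matrix) 형식으로 들어가게 됩니다. 값이 0 과 1 뿐인 원-핫 인코딩과 비슷해 보이지만, 한 문장에 같은 trigram 이 여러 번 나오면 그 횟수가 그대로 기록됩니다.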

sum_words = bag_of_words.sum(axis=0)

sum_words 안에는 axis=0 방향으로, 즉 모든 문장(행)에 걸쳐 같은 위치(열)의 값들이 합쳐져 trigram 별 전체 빈도 수가 들어가게 됩니다.

words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]

vocabulary_ 에 저장된 각 trigram 의 고유한 인덱스로 빈도수가 담겨있는 sum_words 를 조회하여 (trigram, 빈도수) 쌍의 리스트를 만듭니다.

words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
return words_freq[:10]

key 설정을 이용해 빈도수를 정렬기준으로 삼아 내림차순으로 정렬한 후 상위 10개를 반환하였습니다.

저작자표시 비영리 변경금지

현재글[Dacon] 소설 작가 분류 AI 경진대회 정리[01.19 ~ 01.20]

Jerry StoryWalk