# Source: emotion-chatbot-app / notebooks / explore_data.py
# (repository page residue: "hfexample's picture")
# Commit message: Deploy clean snapshot of the repository
# Commit: e221c83
import pandas as pd
import json
import re
import matplotlib.pyplot as plt
import seaborn as sns
# --- Matplotlib Korean font setup (Windows: Malgun Gothic) ---
# Assigning to rcParams does not fail when a font family is missing (matplotlib
# only falls back at draw time), so the font is looked up explicitly:
# findfont raises ValueError when the family is absent and fallback is disabled.
from matplotlib import font_manager

try:
    font_manager.findfont('Malgun Gothic', fallback_to_default=False)
    plt.rcParams['font.family'] = 'Malgun Gothic'
except ValueError:
    print("ν•œκΈ€ 폰트 섀정에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€. κ·Έλž˜ν”„μ˜ 라벨이 깨질 수 μžˆμŠ΅λ‹ˆλ‹€.")
# Applied whichever font ends up being used, exactly as before.
plt.rcParams['axes.unicode_minus'] = False
# --- Emotion mapping function ---
def map_emotion_code(ecode):
    """
    Translate an E-code string into its major-category emotion label.

    The integer that follows the leading 'E' is bucketed by tens: 10-19,
    20-29, ..., 60-69 each correspond to one label (e.g. 'E60' -> '기쁨').

    Returns None when `ecode` is not a string, is shorter than two
    characters, does not start with 'E', has a non-integer remainder, or
    falls outside 10-69.
    """
    labels_by_decade = {
        1: 'λΆ„λ…Έ',
        2: 'μŠ¬ν””',
        3: 'λΆˆμ•ˆ',
        4: 'μƒμ²˜',
        5: 'λ‹Ήν™©',
        6: '기쁨',
    }

    well_formed = isinstance(ecode, str) and len(ecode) >= 2 and ecode[0] == 'E'
    if not well_formed:
        return None

    try:
        # Drop the 'E' prefix and parse what is left as an integer.
        numeric_part = int(ecode[1:])
    except ValueError:
        return None

    # Floor division sends 10-19 to 1, ..., 60-69 to 6; every other value
    # (including negatives) produces a key that is not in the table.
    return labels_by_decade.get(numeric_part // 10)
# --- [Phase 1] Data loading and merging ---
print("---" + "[Phase 1] 데이터 λ‘œλ”© 및 병합 μ‹œμž‘" + "---")
# File locations (relative to the current working directory).
data_path = 'data/'
train_text_path = data_path + 'training-origin.xlsx'
train_label_path = data_path + 'training-label.json'
val_text_path = data_path + 'validation-origin.xlsx'
val_label_path = data_path + 'test.json'
# 1. Load the data: dialogue text from Excel, labels from JSON.
try:
    df_train_text = pd.read_excel(train_text_path, header=0)
    df_val_text = pd.read_excel(val_text_path, header=0)
    with open(train_label_path, 'r', encoding='utf-8') as f:
        train_labels_raw = json.load(f)
    with open(val_label_path, 'r', encoding='utf-8') as f:
        val_labels_raw = json.load(f)
    print("파일 λ‘œλ”© 성곡!")
except FileNotFoundError as e:
    print(f"νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {e}")
    print("파일 κ²½λ‘œμ™€ 파일 이름을 λ‹€μ‹œ ν™•μΈν•΄μ£Όμ„Έμš”.")
    # Stop with a non-zero status so callers can detect the failure. The
    # bare `exit()` helper is injected by the `site` module (not guaranteed
    # to exist) and reports success (status 0).
    raise SystemExit(1)
# 2. Label cleaning and extraction
def extract_emotions(raw_labels):
    """
    Pull the emotion code out of every label record.

    Args:
        raw_labels: iterable of records; the code is read from
            record['profile']['emotion']['type'].

    Returns:
        A list with one entry per record, in input order. The entry is None
        when the path cannot be followed, so the result always has the same
        length as the input and stays row-aligned with it.
    """
    emotions = []
    for dialogue in raw_labels:
        try:
            emotions.append(dialogue['profile']['emotion']['type'])
        except (KeyError, TypeError):
            # KeyError: a level of the path is missing.
            # TypeError: a level is null / not a mapping (e.g. JSON null).
            emotions.append(None)
    return emotions
# One single-column ('emotion') frame per split, built in train -> validation order.
df_train_labels, df_val_labels = (
    pd.DataFrame({'emotion': extract_emotions(raw)})
    for raw in (train_labels_raw, val_labels_raw)
)
# 3. Merging text data with label data
def combine_dialogues(df):
    """
    Join every dialogue column of `df` into a single 'text' column.

    A dialogue column is any column whose name contains 'λ¬Έμž₯'. Those
    columns are converted to strings in place (missing values become ''),
    and 'text' holds the non-empty values of each row joined by one space.

    Args:
        df: DataFrame to process; it is modified in place.

    Returns:
        The same DataFrame object, with the added 'text' column.
    """
    dialogue_cols = [col for col in df.columns if 'λ¬Έμž₯' in str(col)]
    for col in dialogue_cols:
        # Fill BEFORE casting: astype(str) turns NaN/None into the literal
        # strings 'nan'/'None', after which fillna has nothing left to fill.
        df[col] = df[col].fillna('').astype(str)
    if not dialogue_cols:
        # Nothing to join; every row gets an empty text.
        df['text'] = ''
        return df
    # Plain row iteration also works for a zero-row frame, where
    # DataFrame.apply(axis=1) would hand back a DataFrame instead of a Series.
    df['text'] = [
        ' '.join(part for part in row if part)
        for row in df[dialogue_cols].itertuples(index=False, name=None)
    ]
    return df
# Put each text table side by side with its labels, then build the joined
# 'text' column for that split.
df_train = combine_dialogues(pd.concat([df_train_text, df_train_labels], axis=1))
df_val = combine_dialogues(pd.concat([df_val_text, df_val_labels], axis=1))
# Map the raw E-code (emotion) to its major category (major_emotion) and drop
# the rows for which no category could be determined.
for _split in (df_train, df_val):
    _split['major_emotion'] = _split['emotion'].apply(map_emotion_code)
    _split.dropna(subset=['major_emotion'], inplace=True)
# 4. Stack training and validation data into one frame with a fresh index.
df_combined = pd.concat([df_train, df_val], ignore_index=True)
print("\n--- 톡합 λ°μ΄ν„°ν”„λ ˆμž„μ˜ 첫 5쀄 (λ§€ν•‘ ν›„) ---")
print(df_combined.loc[:, ['text', 'emotion', 'major_emotion']].head(5))
print("\n--- 톡합 λ°μ΄ν„°ν”„λ ˆμž„ 크기 ---")
print(f"톡합 데이터: {df_combined.shape}")
print("--- [Phase 1] μ™„λ£Œ ---")
# --- [Phase 2] Data exploration and preprocessing ---
print("\n---" + "[Phase 2] 데이터 탐색 및 μ „μ²˜λ¦¬ μ‹œμž‘" + "---")
# 1. Explore and visualise the data
print("\n---" + "톡합 데이터 (ν›ˆλ ¨ + 검증) 감정 뢄포" + "---")
emotion_counts = df_combined['major_emotion'].value_counts()
print(emotion_counts)
print("-------------------------------------------\n")
# Horizontal bar chart of the emotion distribution
plt.figure(figsize=(10, 6))
sns.barplot(x=emotion_counts.values, y=emotion_counts.index, color='#2c7bb6')
for index, value in enumerate(emotion_counts.values):
    # Count label just to the right of the end of each bar.
    plt.text(x=value + 100, y=index, s=f'{value:,}', va='center', ha='left', fontsize=12, color='black')
plt.title('ν›ˆλ ¨ + 검증 데이터 톡합 감정 뢄포 μ‹œκ°ν™”', fontsize=15)
plt.xlabel('개수', fontsize=12)
plt.ylabel('감정', fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)
# Keep the original 0-15000 axis whenever the data fits; widen it only when
# the largest bar (plus room for its label) would otherwise be clipped.
x_upper = 15000
if len(emotion_counts) > 0:
    x_upper = max(x_upper, int(emotion_counts.max() * 1.1))
plt.xlim(0, x_upper)
plt.ticklabel_format(style='plain', axis='x')
# plt.show() can block until the window is closed, so the hint that tells the
# user to close the window has to be printed before it, not after.
print("\nμ‹œκ°ν™” μ™„λ£Œ. κ·Έλž˜ν”„ 창을 λ‹«μœΌλ©΄ λ‹€μŒ 단계가 μ§„ν–‰λ©λ‹ˆλ‹€.")
plt.show()
# 2. Text cleaning
print("\n---", "ν…μŠ€νŠΈ μ •μ œ μ‹œμž‘", "---", sep="")
def clean_text(text):
    """
    Strip every character that is not a Hangul syllable, an ASCII letter,
    an ASCII digit or a space.

    Args:
        text: value to clean.

    Returns:
        The filtered string, or "" when `text` is not a str.
    """
    if not isinstance(text, str):
        return ""
    # The Hangul syllable block (U+AC00-U+D7A3) is written with escapes so the
    # range cannot be damaged by a bad encoding round-trip of this file; a
    # corrupted literal range silently lets CJK, symbols and other characters
    # through.
    return re.sub(r'[^\uac00-\ud7a3a-zA-Z0-9 ]', '', text)
# Run the cleaner over every joined dialogue and show a short preview.
df_combined['cleaned_text'] = df_combined['text'].map(clean_text)
print("ν…μŠ€νŠΈ μ •μ œ μ™„λ£Œ.")
print(df_combined.loc[:, ['text', 'cleaned_text', 'major_emotion']].head(5))
print("--- [Phase 2] μ™„λ£Œ ---")
print("\nλͺ¨λ“  과정이 μ™„λ£Œλ˜μ—ˆμŠ΅λ‹ˆλ‹€. 이제 이 λ°μ΄ν„°ν”„λ ˆμž„(df_combined)으둜 뢄석을 계속 μ§„ν–‰ν•  수 μžˆμŠ΅λ‹ˆλ‹€.")