Spaces:
Running
Running
| import pandas as pd | |
| import json | |
| import re | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
# --- Matplotlib Korean font setup (Windows: Malgun Gothic) ---
# The plot title and axis labels below are Korean, so a Hangul-capable font
# family is selected before any figure is created.
# NOTE(review): the message text was restored from mis-decoded characters
# (UTF-8 bytes rendered as ISO-8859-7) — confirm against the original file.
# NOTE(review): assigning rcParams does not appear to check that the font is
# installed, so this handler may rarely fire — confirm.
try:
    plt.rcParams['font.family'] = 'Malgun Gothic'
    # Draw negative tick labels with an ASCII hyphen instead of U+2212.
    plt.rcParams['axes.unicode_minus'] = False
except Exception:
    # Best effort only: keep running with the default font. `Exception`
    # (not a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
    print("한글 폰트 설정에 실패했습니다. 그래프의 라벨이 깨질 수 있습니다.")
# --- Emotion mapping function ---
# Major emotion label keyed by the tens digit of the numeric part of an
# E-code (E10-E19 -> 1, E20-E29 -> 2, ...).
# NOTE(review): labels were restored from mis-decoded text (UTF-8 bytes
# rendered as ISO-8859-7) — confirm against the original file.
_MAJOR_EMOTION_BY_DECADE = {
    1: '분노',  # anger
    2: '슬픔',  # sadness
    3: '불안',  # anxiety
    4: '상처',  # hurt
    5: '당황',  # embarrassment
    6: '기쁨',  # joy
}


def map_emotion_code(ecode):
    """
    Map an E-code string to its major emotion label (e.g. 'E11' -> '분노').

    Args:
        ecode: Code such as 'E11'; any other type is tolerated.

    Returns:
        The major emotion label for codes E10-E69, or None when the value is
        not a string, does not start with 'E', has no parsable number after
        the 'E', or falls outside the 10-69 range.
    """
    # Reject non-strings and values too short to hold 'E' plus a number.
    if not isinstance(ecode, str) or len(ecode) < 2 or ecode[0] != 'E':
        return None
    try:
        # Drop the leading 'E' and parse the remainder as the numeric code.
        code_num = int(ecode[1:])
    except ValueError:
        return None
    # Floor division sends negatives, 0-9 and 70+ to keys that are absent
    # from the table, so they yield None just like the range checks did.
    return _MAJOR_EMOTION_BY_DECADE.get(code_num // 10)
# --- [Phase 1] Data loading and merging ---
# NOTE(review): the Korean messages in this section were restored from
# mis-decoded text (UTF-8 bytes rendered as ISO-8859-7) — confirm against the
# original file.
print("---" + "[Phase 1] 데이터 로딩 및 병합 시작" + "---")

# File paths (relative to the current working directory).
data_path = 'data/'
train_text_path = data_path + 'training-origin.xlsx'
train_label_path = data_path + 'training-label.json'
val_text_path = data_path + 'validation-origin.xlsx'
# NOTE(review): 'test.json' does not follow the '<split>-label.json' naming
# used for the training labels — verify this is the intended validation file.
val_label_path = data_path + 'test.json'

# 1. Load the data: two Excel sheets with the dialogue text and two JSON
# files with the labels.
try:
    df_train_text = pd.read_excel(train_text_path, header=0)
    df_val_text = pd.read_excel(val_text_path, header=0)
    with open(train_label_path, 'r', encoding='utf-8') as f:
        train_labels_raw = json.load(f)
    with open(val_label_path, 'r', encoding='utf-8') as f:
        val_labels_raw = json.load(f)
except FileNotFoundError as e:
    print(f"파일을 찾을 수 없습니다: {e}")
    print("파일 경로와 파일 이름을 다시 확인해주세요.")
    # `exit()` is injected by the `site` module for interactive use and
    # reports success (status 0); raise SystemExit with a failure status so
    # the script stops reliably and callers can detect the error.
    raise SystemExit(1) from e
else:
    print("파일 로딩 성공!")
# 2. Clean and extract the label data
def extract_emotions(raw_labels):
    """
    Collect the emotion code of every dialogue record.

    Args:
        raw_labels: Iterable of records; each is expected to carry the code
            at record['profile']['emotion']['type'].

    Returns:
        A list with one entry per record, in input order: the value found at
        that path, or None when the record does not have it.
    """
    emotions = []
    for dialogue in raw_labels:
        try:
            emotions.append(dialogue['profile']['emotion']['type'])
        except (KeyError, TypeError):
            # KeyError: a level of the path is missing.
            # TypeError: a level is None or not a mapping (e.g. a JSON null),
            # which previously aborted the whole extraction.
            emotions.append(None)
    return emotions
# One single-column ('emotion') frame per split, built from the raw label
# records in the same order as they were loaded.
df_train_labels, df_val_labels = (
    pd.DataFrame({'emotion': extract_emotions(raw_records)})
    for raw_records in (train_labels_raw, val_labels_raw)
)
# 3. Merge the text data with the label data
def combine_dialogues(df):
    """
    Join every sentence column of a row into one space-separated string.

    Sentence columns are those whose name contains '문장' ("sentence").
    NOTE(review): this key was restored from mis-decoded text (UTF-8 bytes
    rendered as ISO-8859-7) — confirm it matches the spreadsheet headers.

    Args:
        df: DataFrame holding the sentence columns. It is modified in place:
            the sentence columns are converted to strings and a 'text'
            column is added.

    Returns:
        The same DataFrame object, for call chaining.
    """
    dialogue_cols = [col for col in df.columns if '문장' in str(col)]
    for col in dialogue_cols:
        column = df[col]
        # Blank out missing cells BEFORE converting to str; converting first
        # turns NaN/None into the literal text 'nan'/'None', which then
        # leaks into the joined dialogue.
        df[col] = column.astype(object).where(column.notna(), '').astype(str)
    # Row-wise join over plain lists; unlike DataFrame.apply(axis=1) this
    # also yields a well-formed column when there are no rows or no
    # sentence columns (each row then gets an empty string).
    df['text'] = [
        ' '.join(parts) for parts in df[dialogue_cols].to_numpy().tolist()
    ]
    return df
# Attach the labels to the text rows column-wise; concat aligns on the index.
# NOTE(review): this assumes text rows and label records are in the same
# order and equal in number — verify against the dataset.
# NOTE(review): the Korean messages below were restored from mis-decoded text
# (UTF-8 bytes rendered as ISO-8859-7) — confirm against the original file.
df_train = pd.concat([df_train_text, df_train_labels], axis=1)
df_val = pd.concat([df_val_text, df_val_labels], axis=1)
df_train = combine_dialogues(df_train)
df_val = combine_dialogues(df_val)

# Map the original E-code (emotion) to its major emotion (major_emotion) and
# drop the rows that could not be mapped.
df_train['major_emotion'] = df_train['emotion'].apply(map_emotion_code)
df_val['major_emotion'] = df_val['emotion'].apply(map_emotion_code)
df_train = df_train.dropna(subset=['major_emotion'])
df_val = df_val.dropna(subset=['major_emotion'])

# 4. Combine the training and validation data into one frame
df_combined = pd.concat([df_train, df_val], ignore_index=True)
print("\n--- 통합 데이터프레임의 첫 5줄 (매핑 후) ---")
print(df_combined[['text', 'emotion', 'major_emotion']].head())
print("\n--- 통합 데이터프레임 크기 ---")
print(f"통합 데이터: {df_combined.shape}")
print("--- [Phase 1] 완료 ---")
# --- [Phase 2] Data exploration and preprocessing ---
# NOTE(review): the Korean title, axis labels and messages in this section
# were restored from mis-decoded text (UTF-8 bytes rendered as ISO-8859-7) —
# confirm against the original file.
print("\n---" + "[Phase 2] 데이터 탐색 및 전처리 시작" + "---")

# 1. Explore and visualise the data
print("\n---" + "통합 데이터 (훈련 + 검증) 감정 분포" + "---")
emotion_counts = df_combined['major_emotion'].value_counts()
print(emotion_counts)
print("-------------------------------------------\n")

# Horizontal bar chart of the emotion distribution
plt.figure(figsize=(10, 6))
sns.barplot(x=emotion_counts.values, y=emotion_counts.index, color='#2c7bb6')
for index, value in enumerate(emotion_counts.values):
    # Print the exact count just to the right of each bar.
    plt.text(x=value + 100, y=index, s=f'{value:,}', va='center', ha='left', fontsize=12, color='black')
plt.title('훈련 + 검증 데이터 통합 감정 분포 시각화', fontsize=15)
plt.xlabel('개수', fontsize=12)  # count
plt.ylabel('감정', fontsize=12)  # emotion
plt.grid(axis='x', linestyle='--', alpha=0.7)
# Keep the original 0-15000 range, but widen it when a class is larger so
# neither the bar nor its value label is clipped.
largest_count = 0 if emotion_counts.empty else int(emotion_counts.max())
plt.xlim(0, max(15000, int(largest_count * 1.1)))
plt.ticklabel_format(style='plain', axis='x')
plt.show()
print("\n시각화 완료. 그래프 창을 닫으면 다음 단계가 진행됩니다.")

# 2. Text cleaning
print("\n---" + "텍스트 정제 시작" + "---")
# Matches every character that is NOT a precomposed Hangul syllable
# (U+AC00-U+D7A3), an ASCII letter, an ASCII digit or a space. The Hangul
# range is written with escapes so the pattern survives re-encoding of the
# file; the previous, garbled class kept Latin-1/Greek characters and
# stripped all Korean text.
_DISALLOWED_CHARS = re.compile(r'[^\uAC00-\uD7A3a-zA-Z0-9 ]')


def clean_text(text):
    """
    Strip everything except Hangul syllables, ASCII letters, digits, spaces.

    Args:
        text: Value to clean; any type is tolerated.

    Returns:
        The filtered string, or "" when `text` is not a string. Punctuation,
        tabs, newlines and standalone Hangul jamo are removed.
    """
    if not isinstance(text, str):
        return ""
    return _DISALLOWED_CHARS.sub('', text)
# Clean the joined dialogue text and show a sample of the result.
# NOTE(review): the Korean messages below were restored from mis-decoded text
# (UTF-8 bytes rendered as ISO-8859-7) — confirm against the original file.
df_combined['cleaned_text'] = df_combined['text'].apply(clean_text)
print("텍스트 정제 완료.")
print(df_combined[['text', 'cleaned_text', 'major_emotion']].head())
print("--- [Phase 2] 완료 ---")
print("\n모든 과정이 완료되었습니다. 이제 이 데이터프레임(df_combined)으로 분석을 계속 진행할 수 있습니다.")