Spaces:

taehoon222
/

emotion-chatbot-app

Running

App Files Files Community

emotion-chatbot-app / notebooks /explore_data.py

hfexample

Deploy clean snapshot of the repository

e221c83 13 days ago

raw

history blame contribute delete

5.58 kB

	import pandas as pd
	import json
	import re
	import matplotlib.pyplot as plt
	import seaborn as sns

	# --- Matplotlib 한글 폰트 설정 (Windows: Malgun Gothic) ---
	try:
	plt.rcParams['font.family'] = 'Malgun Gothic'
	plt.rcParams['axes.unicode_minus'] = False
	except:
	print("한글 폰트 설정에 실패했습니다. 그래프의 라벨이 깨질 수 있습니다.")

	# --- 감정 매핑 함수 정의 ---
	def map_emotion_code(ecode):
	"""
	E코드 문자열을 대분류 감정 문자열로 매핑합니다. (예: 'E11' -> '분노')
	"""
	# E코드 문자열이 아니거나 형식이 맞지 않으면 None 반환
	if not isinstance(ecode, str) or len(ecode) < 2 or ecode[0] != 'E':
	return None

	try:
	# 'E'를 제거하고 숫자 부분만 추출
	code_num = int(ecode[1:])
	except ValueError:
	return None

	if 10 <= code_num <= 19:
	return '분노'
	elif 20 <= code_num <= 29:
	return '슬픔'
	elif 30 <= code_num <= 39:
	return '불안'
	elif 40 <= code_num <= 49:
	return '상처'
	elif 50 <= code_num <= 59:
	return '당황'
	elif 60 <= code_num <= 69:
	return '기쁨'
	else:
	return None

	# --- [Phase 1] 데이터 로딩 및 병합 ---
	print("---" + "[Phase 1] 데이터 로딩 및 병합 시작" + "---")

	# 파일 경로 설정
	data_path = 'data/'
	train_text_path = data_path + 'training-origin.xlsx'
	train_label_path = data_path + 'training-label.json'
	val_text_path = data_path + 'validation-origin.xlsx'
	val_label_path = data_path + 'test.json'

	# 1. 데이터 불러오기
	try:
	df_train_text = pd.read_excel(train_text_path, header=0)
	df_val_text = pd.read_excel(val_text_path, header=0)

	with open(train_label_path, 'r', encoding='utf-8') as f:
	train_labels_raw = json.load(f)
	with open(val_label_path, 'r', encoding='utf-8') as f:
	val_labels_raw = json.load(f)

	print("파일 로딩 성공!")

	except FileNotFoundError as e:
	print(f"파일을 찾을 수 없습니다: {e}")
	print("파일 경로와 파일 이름을 다시 확인해주세요.")
	exit()


	# 2. 라벨 데이터 정제 및 추출
	def extract_emotions(raw_labels):
	emotions = []
	for dialogue in raw_labels:
	try:
	emotions.append(dialogue['profile']['emotion']['type'])
	except KeyError:
	emotions.append(None)
	return emotions

	df_train_labels = pd.DataFrame({'emotion': extract_emotions(train_labels_raw)})
	df_val_labels = pd.DataFrame({'emotion': extract_emotions(val_labels_raw)})

	# 3. 텍스트 데이터와 라벨 데이터 병합
	def combine_dialogues(df):
	dialogue_cols = [col for col in df.columns if '문장' in str(col)]
	for col in dialogue_cols:
	df[col] = df[col].astype(str).fillna('')
	df['text'] = df[dialogue_cols].apply(lambda row: ' '.join(row), axis=1)
	return df

	df_train = pd.concat([df_train_text, df_train_labels], axis=1)
	df_val = pd.concat([df_val_text, df_val_labels], axis=1)
	df_train = combine_dialogues(df_train)
	df_val = combine_dialogues(df_val)


	# 원본 E코드(emotion)를 대분류 감정(major_emotion)으로 매핑하고, 매핑되지 않은 데이터는 제거합니다.
	df_train['major_emotion'] = df_train['emotion'].apply(map_emotion_code)
	df_val['major_emotion'] = df_val['emotion'].apply(map_emotion_code)

	df_train.dropna(subset=['major_emotion'], inplace=True)
	df_val.dropna(subset=['major_emotion'], inplace=True)

	# 4. 훈련 데이터와 검증 데이터 통합
	df_combined = pd.concat([df_train, df_val], ignore_index=True)


	print("\n--- 통합 데이터프레임의 첫 5줄 (매핑 후) ---")
	print(df_combined[['text', 'emotion', 'major_emotion']].head())
	print("\n--- 통합 데이터프레임 크기 ---")
	print(f"통합 데이터: {df_combined.shape}")
	print("--- [Phase 1] 완료 ---")


	# --- [Phase 2] 데이터 탐색 및 전처리 ---
	print("\n---" + "[Phase 2] 데이터 탐색 및 전처리 시작" + "---")

	# 1. 데이터 탐색 및 시각화
	print("\n---" + "통합 데이터 (훈련 + 검증) 감정 분포" + "---")
	emotion_counts = df_combined['major_emotion'].value_counts()
	print(emotion_counts)
	print("-------------------------------------------\n")


	# 감정 분포 시각화
	plt.figure(figsize=(10, 6))
	sns.barplot(x=emotion_counts.values, y=emotion_counts.index, color='#2c7bb6')
	for index, value in enumerate(emotion_counts.values):
	plt.text(x=value + 100, y=index, s=f'{value:,}', va='center', ha='left', fontsize=12, color='black')

	plt.title('훈련 + 검증 데이터 통합 감정 분포 시각화', fontsize=15)
	plt.xlabel('개수', fontsize=12)
	plt.ylabel('감정', fontsize=12)
	plt.grid(axis='x', linestyle='--', alpha=0.7)
	plt.xlim(0, 15000)
	plt.ticklabel_format(style='plain', axis='x')

	plt.show()

	print("\n시각화 완료. 그래프 창을 닫으면 다음 단계가 진행됩니다.")

	# 2. 텍스트 정제
	print("\n---" + "텍스트 정제 시작" + "---")

	def clean_text(text):
	if not isinstance(text, str):
	return ""
	# 정규표현식을 사용하여 한글, 영어, 숫자, 공백을 제외한 모든 문자 제거
	return re.sub(r'[^가-힣a-zA-Z0-9 ]', '', text)


	df_combined['cleaned_text'] = df_combined['text'].apply(clean_text)

	print("텍스트 정제 완료.")
	print(df_combined[['text', 'cleaned_text', 'major_emotion']].head())
	print("--- [Phase 2] 완료 ---")

	print("\n모든 과정이 완료되었습니다. 이제 이 데이터프레임(df_combined)으로 분석을 계속 진행할 수 있습니다.")