emotion-chatbot-app / notebooks /text_cleaner.py
hfexample's picture
Deploy clean snapshot of the repository
e221c83
import json
import re
import os
# 1. Define the text-cleaning function (same logic as train_final.py)
def clean_text(text: str) -> str:
    """Remove every character except Hangul, ASCII letters, digits and spaces.

    Args:
        text: Value to clean. It is passed through ``str()`` first, so
            non-string values are cleaned via their string form.

    Returns:
        The input with everything outside the Hangul syllable range
        (U+AC00-U+D7A3), ``a-z``, ``A-Z``, ``0-9`` and the space character
        removed. Punctuation, tabs, newlines and standalone Hangul jamo fall
        outside that set and are dropped as well.
    """
    # The Hangul range is spelled with \u escapes rather than literal
    # characters so a mis-decoded source file cannot silently corrupt the
    # character class (a corrupted range can even make the pattern invalid).
    return re.sub(r'[^\uAC00-\uD7A3a-zA-Z0-9 ]', '', str(text))
# 2. Set the file path and load the data (assumes the file lives under data/)
file_path = './data/training-label.json'

# Number of leading records to preview. The slice and the status message are
# both derived from this value so they cannot disagree with each other.
SAMPLE_SIZE = 5

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        training_data_raw = json.load(f)

    # Report how many records will actually be shown, which may be fewer than
    # SAMPLE_SIZE when the file is small.
    shown_count = min(SAMPLE_SIZE, len(training_data_raw))
    print(f"✅ '{file_path}' 파일 로딩 성공. 총 {len(training_data_raw)}개 데이터 중 {shown_count}개만 추출합니다.")
    print("---------------------------------------------------\n")

    # 3. Process the first SAMPLE_SIZE records and keep raw/cleaned pairs.
    comparison_data = []
    # training_data_raw is sliced and iterated as a list of records; the
    # original note describes each record as one conversation.
    for record_id, data in enumerate(training_data_raw[:SAMPLE_SIZE], start=1):
        # Join every value of the record's talk/content mapping with single
        # spaces to form the raw text (noted as mirroring explore_data.py).
        raw_text = " ".join(data['talk']['content'].values())
        cleaned_text = clean_text(raw_text)

        # Emotion code stored with the record, carried along for reference.
        emotion_type = data['profile']['emotion']['type']

        comparison_data.append({
            'ID': record_id,
            'Emotion': emotion_type,
            'Raw Text': raw_text,
            'Cleaned Text': cleaned_text,
        })

    # 4. Print each raw/cleaned pair.
    for item in comparison_data:
        print(f"--- ID: {item['ID']} (감정 코드: {item['Emotion']}) ---")
        print(f" 원본 (Raw) : {item['Raw Text']}")
        print(f" 정제 (Clean): {item['Cleaned Text']}")
        print("-" * 30)

except FileNotFoundError:
    # Show the absolute path so a wrong working directory is easy to spot.
    print(f"❌ 오류: 데이터 파일을 찾을 수 없습니다. 경로를 확인하세요: {os.path.abspath(file_path)}")
except Exception as e:
    # Top-level boundary of the script: any other failure (malformed JSON,
    # unexpected record layout, ...) is reported instead of a traceback.
    print(f"❌ 데이터 처리 중 오류 발생: {e}")