|
|
import gradio as gr
|
|
|
import pandas as pd
|
|
|
import matplotlib.pyplot as plt
|
|
|
import seaborn as sns
|
|
|
from wordcloud import WordCloud
|
|
|
import jieba
|
|
|
import jieba.analyse
|
|
|
from collections import Counter
|
|
|
import numpy as np
|
|
|
from pathlib import Path
|
|
|
import re
|
|
|
from datetime import datetime
|
|
|
import platform
|
|
|
import os
|
|
|
from typing import List, Optional, Tuple
|
|
|
import warnings
|
|
|
from sklearn.cluster import KMeans, DBSCAN
|
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
from sklearn.decomposition import PCA
|
|
|
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
from sklearn.metrics import classification_report, mean_squared_error, r2_score
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
from scipy import stats
|
|
|
from scipy.cluster.hierarchy import dendrogram, linkage
|
|
|
import networkx as nx
|
|
|
from matplotlib.patches import Rectangle
|
|
|
import os
|
|
|
import urllib.request
|
|
|
from pathlib import Path
|
|
|
# Ignore every warning category raised by the libraries used in this module.
warnings.filterwarnings('ignore')

# Global plot appearance: the seaborn grid style is applied first, then the
# matplotlib style sheet is loaded on top of it.
sns.set_style("whitegrid")
plt.style.use('seaborn-v0_8-darkgrid')
|
|
|
|
|
|
|
|
|
# Chinese stop words removed from tokenised review text before word counting.
STOPWORDS = {
    # Very common function words
    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一个', '也', '很', '到',
    '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这个', '还', '可以', '比较',
    '里', '上', '能', '多', '里面', '感觉', '觉得', '然后', '但是', '如果', '因为', '所以',
    # Degree adverbs
    '非常', '特别', '十分', '挺', '蛮', '相当', '更', '最',
    # Time expressions
    '时候', '现在', '已经', '刚', '正在', '曾经', '以前', '之前', '以后', '之后',
    # Pronouns and question words
    '我们', '你们', '他们', '她们', '它们', '这里', '那里', '哪里', '什么', '怎么', '为什么',
    # Interjections and modal particles
    '啊', '呀', '吧', '呢', '哦', '哈', '嗯', '唉',
    # Conjunctions
    '而且', '并且', '或者', '还是', '以及', '及其', '虽然',
    # Miscellaneous filler words
    '这样', '那样', '怎样', '这种', '那种', '如此', '确实', '真的', '实在', '其实', '当然',
    '只是', '就是', '而已', '罢了', '左右', '上下', '之类', '等等', '之类的',
    '一下', '一点', '有点', '一些', '这些', '那些', '哪些', '每个', '各种', '所有',
    '进行', '开始', '结束', '成为', '变成', '得到', '拥有', '出现', '发现', '认为',
    '表示', '通过', '根据', '按照', '由于', '关于', '对于', '至于', '作为',
}
|
|
|
|
|
|
|
|
|
def download_chinese_font(timeout=30):
    """Make sure the Source Han Sans SC font exists under ``fonts/``.

    The font is looked up at ``fonts/SourceHanSansSC-Regular.otf`` relative to
    the current working directory. If it is missing (or empty), each mirror
    URL is tried in turn. Every attempt is written to a ``.part`` file and
    only renamed to the final name after the transfer completed, so an
    interrupted download can never be mistaken for a valid font on the next
    run.

    Args:
        timeout: Socket timeout in seconds for each download attempt.

    Returns:
        The font path as a string, or ``None`` when the font directory cannot
        be created or every mirror failed.
    """
    import shutil

    font_dir = Path("fonts")
    try:
        font_dir.mkdir(exist_ok=True)
    except OSError as e:
        # e.g. read-only deployment directory: let the caller fall back to
        # system fonts instead of failing the whole import.
        print(f"⚠️ 下载失败: {e}")
        return None

    font_path = font_dir / "SourceHanSansSC-Regular.otf"

    # A zero-byte file is a leftover of a broken download, not a usable font.
    if font_path.exists() and font_path.stat().st_size > 0:
        print(f"✅ 字体已存在: {font_path}")
        return str(font_path)

    print("📥 正在下载思源黑体...")

    font_urls = [
        "https://github.com/adobe-fonts/source-han-sans/raw/release/OTF/SimplifiedChinese/SourceHanSansSC-Regular.otf",
        "https://ghproxy.com/https://github.com/adobe-fonts/source-han-sans/raw/release/OTF/SimplifiedChinese/SourceHanSansSC-Regular.otf",
        "https://cdn.jsdelivr.net/gh/adobe-fonts/source-han-sans@release/OTF/SimplifiedChinese/SourceHanSansSC-Regular.otf",
    ]

    partial_path = font_dir / (font_path.name + ".part")

    for url in font_urls:
        try:
            print(f"尝试从 {url[:50]}... 下载")
            with urllib.request.urlopen(url, timeout=timeout) as response, \
                    open(partial_path, "wb") as out_file:
                shutil.copyfileobj(response, out_file)
            if partial_path.stat().st_size == 0:
                raise OSError("downloaded file is empty")
            # Rename only after a complete transfer.
            partial_path.replace(font_path)
        except Exception as e:
            # Best effort: report, discard the partial file, try the next mirror.
            print(f"⚠️ 下载失败: {e}")
            partial_path.unlink(missing_ok=True)
            continue

        print(f"✅ 字体下载成功: {font_path}")
        return str(font_path)

    print("❌ 所有字体源下载失败")
    return None
|
|
|
|
|
|
|
|
|
def get_chinese_font():
    """Locate a font file able to render Chinese text.

    The downloaded project font is preferred. Failing that, a fixed list of
    Linux font paths is probed, then the Windows or macOS paths matching the
    current platform.

    Returns:
        The path of the first font file found, or ``None`` if none exists.
    """
    project_font = download_chinese_font()
    if project_font and os.path.exists(project_font):
        return project_font

    linux_candidates = (
        '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
        '/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc',
        '/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc',
    )
    located = next((path for path in linux_candidates if os.path.exists(path)), None)
    if located is not None:
        print(f"✅ 找到系统字体: {located}")
        return located

    platform_candidates = {
        'Windows': ('C:/Windows/Fonts/msyh.ttc', 'C:/Windows/Fonts/simhei.ttf'),
        'Darwin': ('/System/Library/Fonts/STHeiti Light.ttc',),
    }
    for candidate in platform_candidates.get(platform.system(), ()):
        if os.path.exists(candidate):
            return candidate

    print("⚠️ 未找到任何中文字体")
    return None
|
|
|
|
|
|
|
|
|
|
|
|
# Resolved once at import time; also handed to WordCloud as its font_path.
CHINESE_FONT = get_chinese_font()

if not CHINESE_FONT:
    # No Chinese-capable font available: keep matplotlib's stock sans-serif.
    plt.rcParams['font.sans-serif'] = ['DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False
    print("⚠️ 使用默认字体(可能不支持中文)")
else:
    from matplotlib import font_manager

    # Register the font file with matplotlib, then select it by family name.
    font_manager.fontManager.addfont(CHINESE_FONT)
    font_prop = font_manager.FontProperties(fname=CHINESE_FONT)
    font_family_name = font_prop.get_name()

    plt.rcParams['font.sans-serif'] = [font_family_name]
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['axes.unicode_minus'] = False

    print(f"✅ Matplotlib 已配置字体: {font_family_name}")
|
|
|
|
|
|
|
|
|
class TourismDataAnalyzer:
|
|
|
def __init__(self, data_folder='data'):
|
|
|
self.data_folder = Path(data_folder)
|
|
|
self.all_data = None
|
|
|
self.load_all_data()
|
|
|
|
|
|
def get_preview_data(self, n_samples=100):
    """Return an anonymised, display-ready sample of the loaded reviews.

    User names are replaced by sequential pseudonyms (``用户0001`` ...) derived
    from the full data set, timestamps are formatted as ``YYYY-MM-DD HH:MM``
    and comments longer than 100 characters are cut and suffixed with ``...``.

    Args:
        n_samples: Maximum number of rows to return. When more rows are
            available a reproducible random sample (seed 42) is drawn.

    Returns:
        A new DataFrame sorted by time (newest first) whose index is the
        1-based row number in that order. ``self.all_data`` is not modified.
    """
    display_columns = ['用户名', '景点', '评分', '时间', '评论内容', '评论长度', '情感分类']
    source = self.all_data

    # Pseudonyms come from the complete data set so they do not depend on
    # which rows end up in the sample.
    user_mapping = {
        user: f'用户{i + 1:04d}'
        for i, user in enumerate(source['用户名'].unique())
    }

    # Sample first: formatting only the rows that are shown is cheaper and
    # selects the same rows, because sampling depends only on row positions.
    if len(source) > n_samples:
        source = source.sample(n=n_samples, random_state=42)

    df_preview = source[display_columns].copy()
    df_preview['用户名'] = df_preview['用户名'].map(user_mapping)
    df_preview['时间'] = df_preview['时间'].dt.strftime('%Y-%m-%d %H:%M')

    def shorten(comment):
        # Slice the string form so a non-string cell cannot raise.
        text = str(comment)
        return text[:100] + '...' if len(text) > 100 else comment

    df_preview['评论内容'] = df_preview['评论内容'].apply(shorten)

    # Sort before renumbering so the 1-based index follows the display order.
    df_preview = df_preview.sort_values('时间', ascending=False).reset_index(drop=True)
    df_preview.index = df_preview.index + 1
    return df_preview
|
|
|
|
|
|
def load_all_data(self):
    """Load every ``*.xlsx`` file in ``self.data_folder`` into one DataFrame.

    Each workbook must contain the columns 用户名, 时间, 评分 and 评论内容;
    files that cannot be read or lack a column are reported and skipped. The
    file stem becomes the 景点 (place) column. Rows whose time, rating or
    comment is missing or unparseable are dropped, then the derived columns
    (comment length, year, month, quarter, weekday, weekend flag, hour and
    sentiment class) are added.

    Returns:
        The combined DataFrame, also stored in ``self.all_data``.

    Raises:
        ValueError: If no usable file is found, or no row survives cleaning.
    """
    required_columns = ['用户名', '时间', '评分', '评论内容']
    all_dfs = []

    # sorted(): glob order is filesystem dependent; a stable order keeps row
    # order (and everything derived from it) reproducible between runs.
    for file_path in sorted(self.data_folder.glob('*.xlsx')):
        try:
            df = pd.read_excel(file_path)
        except Exception as e:
            # Best effort: one unreadable workbook must not abort the load.
            print(f"✗ 加载 {file_path.name} 失败: {e}")
            continue

        if not all(col in df.columns for col in required_columns):
            print(f"⚠️ {file_path.name} 缺少必要列,跳过")
            continue

        df['景点'] = file_path.stem
        all_dfs.append(df)
        print(f"✓ 成功加载: {file_path.name} ({len(df)} 条)")

    if not all_dfs:
        raise ValueError("未找到任何有效数据文件!")

    data = pd.concat(all_dfs, ignore_index=True)

    # Unparseable values become NaT/NaN and are removed with the missing ones.
    data['时间'] = pd.to_datetime(data['时间'], errors='coerce')
    data['评分'] = pd.to_numeric(data['评分'], errors='coerce')
    data = data.dropna(subset=['时间', '评分', '评论内容']).copy()

    if data.empty:
        # Without this check the summary print below fails on NaT.
        raise ValueError("数据清洗后没有有效评论!")

    # astype(str): a numeric-only comment cell would otherwise get a NaN length.
    data['评论长度'] = data['评论内容'].astype(str).str.len()
    data['年份'] = data['时间'].dt.year
    data['月份'] = data['时间'].dt.month
    data['季度'] = data['时间'].dt.quarter
    data['星期'] = data['时间'].dt.dayofweek
    data['是否周末'] = data['星期'].isin([5, 6])  # dayofweek: Monday=0 ... Sunday=6
    data['小时'] = data['时间'].dt.hour
    data['情感分类'] = data['评分'].apply(self._classify_sentiment)

    self.all_data = data

    print(f"\n✅ 总计加载 {len(self.all_data)} 条有效评论")
    print(f"📍 涵盖 {self.all_data['景点'].nunique()} 个景点")
    print(f"📅 时间跨度: {self.all_data['时间'].min().date()} 至 {self.all_data['时间'].max().date()}")

    return self.all_data
|
|
|
|
|
|
def _classify_sentiment(self, score):
|
|
|
"""情感分类"""
|
|
|
if pd.isna(score):
|
|
|
return '未知'
|
|
|
elif score >= 4.5:
|
|
|
return '非常满意'
|
|
|
elif score >= 4.0:
|
|
|
return '满意'
|
|
|
elif score >= 3.0:
|
|
|
return '一般'
|
|
|
elif score >= 2.0:
|
|
|
return '不满意'
|
|
|
else:
|
|
|
return '非常不满意'
|
|
|
|
|
|
def filter_data(self, selected_places):
    """Return a copy of the reviews restricted to the chosen places.

    Args:
        selected_places: Collection of place names; ``None`` or an empty
            collection means "all places".

    Returns:
        A new DataFrame; ``self.all_data`` itself is never handed out.
    """
    if not selected_places or len(selected_places) == 0:
        return self.all_data.copy()

    wanted = self.all_data['景点'].isin(selected_places)
    return self.all_data[wanted].copy()
|
|
|
|
|
|
def plot_advanced_rating_analysis(self, selected_places=None):
    """Draw a 3x3 dashboard describing the rating distribution.

    Panels: rating histogram with mean/median markers, box plot, rating-band
    bar chart, per-place mean rating (full-width middle row), rating versus
    comment length with a linear trend line, a Gaussian kernel density
    estimate of the ratings, and a sentiment pie chart.

    Args:
        selected_places: Place names forwarded to ``filter_data``; a falsy
            value selects all places.

    Returns:
        The matplotlib figure, or the result of ``self._create_empty_plot``
        when the selection contains no reviews.
    """
    df = self.filter_data(selected_places)
    if df.empty:
        # Mean, box plot, polyfit and KDE are all undefined without data.
        return self._create_empty_plot('没有符合条件的评论数据')

    ratings = df['评分'].dropna()

    fig = plt.figure(figsize=(18, 12))
    gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

    # --- Histogram with mean / median markers ---
    ax1 = fig.add_subplot(gs[0, 0])
    df['评分'].hist(bins=30, ax=ax1, edgecolor='black', alpha=0.7, color='#3498db')
    ax1.axvline(df['评分'].mean(), color='red', linestyle='--', linewidth=2,
                label=f'均值: {df["评分"].mean():.2f}')
    ax1.axvline(df['评分'].median(), color='green', linestyle='--', linewidth=2,
                label=f'中位数: {df["评分"].median():.2f}')
    ax1.set_xlabel('评分', fontsize=10)
    ax1.set_ylabel('频数', fontsize=10)
    ax1.set_title('评分分布直方图', fontsize=12, fontweight='bold')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # --- Box plot ---
    ax2 = fig.add_subplot(gs[0, 1])
    ax2.boxplot(ratings, vert=True, patch_artist=True,
                boxprops=dict(facecolor='lightblue', alpha=0.7),
                medianprops=dict(color='red', linewidth=2))
    ax2.set_ylabel('评分', fontsize=10)
    ax2.set_title('评分箱线图', fontsize=12, fontweight='bold')
    ax2.grid(True, alpha=0.3, axis='y')

    # --- Rating bands ---
    ax3 = fig.add_subplot(gs[0, 2])
    score_bins = [0, 2, 3, 4, 5]
    score_labels = ['差评\n(<2)', '中差评\n(2-3)', '中评\n(3-4)', '好评\n(4-5)']
    df['评分段'] = pd.cut(df['评分'], bins=score_bins, labels=score_labels, include_lowest=True)
    score_counts = df['评分段'].value_counts().sort_index()
    colors_seg = ['#e74c3c', '#f39c12', '#3498db', '#2ecc71']
    score_counts.plot(kind='bar', ax=ax3, color=colors_seg, alpha=0.8)
    ax3.set_ylabel('评论数', fontsize=10)
    ax3.set_title('评分分段统计', fontsize=12, fontweight='bold')
    ax3.tick_params(axis='x', rotation=0)
    label_offset = max(score_counts.values) * 0.01
    for i, v in enumerate(score_counts.values):
        ax3.text(i, v + label_offset, str(v), ha='center', fontsize=9)
    ax3.grid(True, alpha=0.3, axis='y')

    # --- Mean rating per place ---
    ax4 = fig.add_subplot(gs[1, :])
    place_scores = df.groupby('景点').agg({
        '评分': ['mean', 'count']
    }).round(2)
    place_scores.columns = ['平均评分', '评论数']
    place_scores = place_scores.sort_values('平均评分', ascending=True)

    colors = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(place_scores)))
    ax4.barh(range(len(place_scores)), place_scores['平均评分'], color=colors)
    ax4.set_yticks(range(len(place_scores)))
    ax4.set_yticklabels(place_scores.index, fontsize=9)
    ax4.set_xlabel('平均评分', fontsize=10)
    ax4.set_title('各景点平均评分对比', fontsize=12, fontweight='bold')
    ax4.axvline(df['评分'].mean(), color='red', linestyle='--', alpha=0.5,
                label=f'总体均值: {df["评分"].mean():.2f}')
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='x')

    for i, (idx, row) in enumerate(place_scores.iterrows()):
        ax4.text(row['平均评分'] + 0.05, i,
                 f"{row['平均评分']:.2f} ({int(row['评论数'])}条)",
                 va='center', fontsize=8)

    # --- Rating vs. comment length ---
    ax5 = fig.add_subplot(gs[2, 0])
    ax5.scatter(df['评分'], df['评论长度'], alpha=0.3, s=20, c='#3498db')
    ax5.set_xlabel('评分', fontsize=10)
    ax5.set_ylabel('评论长度', fontsize=10)
    ax5.set_title('评分与评论长度关系', fontsize=12, fontweight='bold')
    ax5.grid(True, alpha=0.3)

    # Drop NaNs row-wise so both polyfit inputs always have the same length;
    # a line is only defined for at least two distinct rating values.
    paired = df[['评分', '评论长度']].dropna()
    if len(paired) >= 2 and paired['评分'].nunique() > 1:
        trend = np.poly1d(np.polyfit(paired['评分'], paired['评论长度'], 1))
        sorted_ratings = paired['评分'].sort_values()
        ax5.plot(sorted_ratings, trend(sorted_ratings),
                 "r--", alpha=0.8, linewidth=2, label='趋势线')
        ax5.legend()

    # --- Kernel density estimate ---
    ax6 = fig.add_subplot(gs[2, 1])
    if ratings.nunique() > 1:
        from scipy import stats

        density = stats.gaussian_kde(ratings)
        xs = np.linspace(ratings.min(), ratings.max(), 200)
        ys = density(xs)

        ax6.plot(xs, ys, linewidth=2, color='#9b59b6')
        ax6.fill_between(xs, 0, ys, alpha=0.3, color='#9b59b6')
    else:
        # gaussian_kde raises when every rating is identical.
        ax6.text(0.5, 0.5, '评分取值单一,无法估计密度', ha='center', va='center',
                 transform=ax6.transAxes, fontsize=10)
    ax6.set_xlabel('评分', fontsize=10)
    ax6.set_ylabel('密度', fontsize=10)
    ax6.set_title('评分密度分布', fontsize=12, fontweight='bold')
    ax6.grid(True, alpha=0.3)

    # --- Sentiment pie ---
    ax7 = fig.add_subplot(gs[2, 2])
    sentiment_counts = df['情感分类'].value_counts()
    # Colour is tied to the label, not to the frequency rank, so a given
    # sentiment keeps the same colour whatever the selection is.
    sentiment_colors = {
        '非常满意': '#2ecc71',
        '满意': '#3498db',
        '一般': '#f39c12',
        '不满意': '#e67e22',
        '非常不满意': '#e74c3c',
    }
    colors_pie = [sentiment_colors.get(label, '#95a5a6') for label in sentiment_counts.index]
    explode = [0.05] * len(sentiment_counts)

    wedges, texts, autotexts = ax7.pie(
        sentiment_counts.values,
        autopct='%1.1f%%',
        colors=colors_pie,
        startangle=90,
        explode=explode,
        textprops={'fontsize': 10},
        pctdistance=0.85
    )
    # Labels go into a legend beside the pie so they cannot overlap.
    ax7.legend(wedges, sentiment_counts.index,
               loc='center left', bbox_to_anchor=(1, 0.5), fontsize=9)

    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
        autotext.set_fontsize(8)

    ax7.set_title('情感分类分布', fontsize=12, fontweight='bold')

    fig.suptitle('评分深度分析', fontsize=16, fontweight='bold', y=0.995)

    return fig
|
|
|
|
|
|
def plot_time_trend_analysis(self, selected_places=None):
    """Draw a 3x2 dashboard of how reviews evolve over time.

    Panels: monthly review count and mean comment length (full-width top
    row), monthly mean rating with a one-standard-deviation band, per-quarter
    mean rating and review count, weekday versus weekend mean rating, and the
    number of reviews per hour of day.

    Args:
        selected_places: Place names forwarded to ``filter_data``; a falsy
            value selects all places.

    Returns:
        The matplotlib figure, or the result of ``self._create_empty_plot``
        when the selection contains no reviews.
    """
    df = self.filter_data(selected_places)
    if df.empty:
        # Resampling and the hour colour scale need at least one row.
        return self._create_empty_plot('没有符合条件的评论数据')

    by_time = df.set_index('时间')

    fig = plt.figure(figsize=(18, 12))
    gs = fig.add_gridspec(3, 2, hspace=0.3, wspace=0.25)

    # --- Monthly volume and comment length (two y-axes) ---
    ax1 = fig.add_subplot(gs[0, :])
    df_monthly = by_time.resample('ME').agg({
        '评分': 'count',
        '评论长度': 'mean'
    })
    df_monthly.columns = ['评论数', '平均评论长度']

    ax1_twin = ax1.twinx()
    line1 = ax1.plot(df_monthly.index, df_monthly['评论数'],
                     marker='o', linewidth=2, markersize=6, color='#3498db', label='评论数')
    ax1.fill_between(df_monthly.index, df_monthly['评论数'], alpha=0.3, color='#3498db')
    ax1.set_ylabel('评论数', fontsize=11, color='#3498db')
    ax1.tick_params(axis='y', labelcolor='#3498db')

    line2 = ax1_twin.plot(df_monthly.index, df_monthly['平均评论长度'],
                          marker='s', linewidth=2, markersize=6, color='#e74c3c', label='平均长度')
    ax1_twin.set_ylabel('平均评论长度', fontsize=11, color='#e74c3c')
    ax1_twin.tick_params(axis='y', labelcolor='#e74c3c')

    # One legend for the lines of both axes.
    lines = line1 + line2
    labels = [l.get_label() for l in lines]
    ax1.legend(lines, labels, loc='upper left')
    ax1.set_title('月度评论数量与长度趋势', fontsize=12, fontweight='bold')
    ax1.grid(True, alpha=0.3)

    # --- Monthly mean rating with +/- 1 std band ---
    ax2 = fig.add_subplot(gs[1, 0])
    df_score_monthly = by_time.resample('ME')['评分'].agg(['mean', 'std'])
    ax2.plot(df_score_monthly.index, df_score_monthly['mean'],
             marker='o', linewidth=2, color='#2ecc71', label='平均评分')
    ax2.fill_between(df_score_monthly.index,
                     df_score_monthly['mean'] - df_score_monthly['std'],
                     df_score_monthly['mean'] + df_score_monthly['std'],
                     alpha=0.2, color='#2ecc71', label='±1标准差')
    ax2.axhline(df['评分'].mean(), color='red', linestyle='--', alpha=0.5, label='总体均值')
    ax2.set_ylabel('评分', fontsize=11)
    ax2.set_title('月度评分趋势(含标准差)', fontsize=12, fontweight='bold')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # --- Quarter comparison (two y-axes) ---
    ax3 = fig.add_subplot(gs[1, 1])
    quarter_data = df.groupby('季度').agg({
        '评分': ['mean', 'count']
    }).round(2)
    quarter_data.columns = ['平均评分', '评论数']

    x = np.arange(len(quarter_data))
    width = 0.35
    bars1 = ax3.bar(x - width / 2, quarter_data['平均评分'], width,
                    label='平均评分', color='#3498db', alpha=0.8)
    ax3_twin = ax3.twinx()
    bars2 = ax3_twin.bar(x + width / 2, quarter_data['评论数'], width,
                         label='评论数', color='#e74c3c', alpha=0.8)

    ax3.set_xlabel('季度', fontsize=11)
    ax3.set_ylabel('平均评分', fontsize=11, color='#3498db')
    ax3_twin.set_ylabel('评论数', fontsize=11, color='#e74c3c')
    ax3.set_xticks(x)
    ax3.set_xticklabels([f'Q{i}' for i in quarter_data.index])
    ax3.set_title('季度对比分析', fontsize=12, fontweight='bold')
    ax3.tick_params(axis='y', labelcolor='#3498db')
    ax3_twin.tick_params(axis='y', labelcolor='#e74c3c')
    ax3.grid(True, alpha=0.3, axis='y')

    lines = [bars1, bars2]
    labels = [l.get_label() for l in lines]
    ax3.legend(lines, labels, loc='upper left')

    # --- Weekday vs. weekend ---
    ax4 = fig.add_subplot(gs[2, 0])
    weekend_comparison = df.groupby('是否周末').agg({
        '评分': ['mean', 'count'],
        '评论长度': 'mean'
    }).round(2)
    # Map the labels instead of assigning a fixed two-item list: a selection
    # with only weekday (or only weekend) reviews has a single group.
    weekend_comparison.index = weekend_comparison.index.map({False: '工作日', True: '周末'})
    group_colors = {'工作日': '#3498db', '周末': '#e74c3c'}

    x_pos = np.arange(len(weekend_comparison))
    bars = ax4.bar(x_pos, weekend_comparison[('评分', 'mean')],
                   color=[group_colors[name] for name in weekend_comparison.index], alpha=0.8)
    ax4.set_xticks(x_pos)
    ax4.set_xticklabels(weekend_comparison.index)
    ax4.set_ylabel('平均评分', fontsize=11)
    ax4.set_title('工作日 vs 周末评分对比', fontsize=12, fontweight='bold')

    for i, bar in enumerate(bars):
        height = bar.get_height()
        count = weekend_comparison.iloc[i][('评分', 'count')]
        ax4.text(bar.get_x() + bar.get_width() / 2., height,
                 f'{height:.2f}\n({int(count)}条)',
                 ha='center', va='bottom', fontsize=10)
    ax4.grid(True, alpha=0.3, axis='y')

    # --- Reviews per hour of day ---
    ax5 = fig.add_subplot(gs[2, 1])
    hour_data = df.groupby('小时').size()
    hours = range(24)
    hour_counts = [hour_data.get(h, 0) for h in hours]

    # Bar colour encodes the count relative to the busiest hour.
    colors_hour = plt.cm.YlOrRd(np.array(hour_counts) / max(hour_counts))
    ax5.bar(hours, hour_counts, color=colors_hour, alpha=0.8)
    ax5.set_xlabel('小时', fontsize=11)
    ax5.set_ylabel('评论数', fontsize=11)
    ax5.set_title('评论发布时段分布', fontsize=12, fontweight='bold')
    ax5.set_xticks(range(0, 24, 3))
    ax5.grid(True, alpha=0.3, axis='y')

    fig.suptitle('时间趋势深度分析', fontsize=16, fontweight='bold', y=0.995)

    return fig
|
|
|
|
|
|
def generate_advanced_wordcloud(self, selected_places=None, rating_filter=None,
                                word_count=100):
    """Render a word cloud next to a bar chart of the 20 most frequent words.

    Args:
        selected_places: Place names forwarded to ``filter_data``.
        rating_filter: "高分评论 (>=4)" keeps ratings >= 4, "低分评论 (<3)"
            keeps ratings < 3; any other value keeps all reviews.
        word_count: Number of most frequent words given to the word cloud.

    Returns:
        The matplotlib figure, or the result of ``self._create_empty_plot``
        when no review or no usable word remains.
    """
    reviews = self.filter_data(selected_places)

    # Each preset narrows the reviews and picks the title / colour map.
    if rating_filter == "高分评论 (>=4)":
        reviews = reviews[reviews['评分'] >= 4]
        title_suffix, colormap = "高分评论", 'Greens'
    elif rating_filter == "低分评论 (<3)":
        reviews = reviews[reviews['评分'] < 3]
        title_suffix, colormap = "低分评论", 'Reds'
    else:
        title_suffix, colormap = "全部评论", 'viridis'

    if not len(reviews):
        return self._create_empty_plot('没有符合条件的评论数据')

    corpus = ' '.join(reviews['评论内容'].astype(str))

    # Keep tokens of at least two characters that are not stop words.
    tokens = [
        token for token in jieba.cut(corpus)
        if len(token) > 1 and token not in STOPWORDS
    ]
    if not tokens:
        return self._create_empty_plot('没有足够的词汇生成词云')

    top_words = Counter(tokens).most_common(word_count)

    fig = plt.figure(figsize=(18, 8))
    grid = fig.add_gridspec(1, 2, width_ratios=[2, 1], wspace=0.15)

    # Left: the word cloud (a failure is shown as text, not raised).
    cloud_ax = fig.add_subplot(grid[0])
    try:
        cloud_builder = WordCloud(
            font_path=CHINESE_FONT,
            width=1200,
            height=600,
            background_color='white',
            max_words=word_count,
            colormap=colormap,
            relative_scaling=0.5,
            min_font_size=10,
            prefer_horizontal=0.7
        )
        cloud_image = cloud_builder.generate_from_frequencies(dict(top_words))

        cloud_ax.imshow(cloud_image, interpolation='bilinear')
        cloud_ax.axis('off')
        cloud_ax.set_title(f'词云图 - {title_suffix} (Top {word_count})',
                           fontsize=14, fontweight='bold', pad=20)
    except Exception as e:
        cloud_ax.text(0.5, 0.5, f'词云生成失败: {str(e)}',
                      ha='center', va='center', fontsize=14, transform=cloud_ax.transAxes)
        cloud_ax.axis('off')

    # Right: horizontal bars for the 20 most frequent words.
    bar_ax = fig.add_subplot(grid[1])
    bar_words = []
    bar_freqs = []
    for word, freq in top_words[:20]:
        bar_words.append(word)
        bar_freqs.append(freq)
    positions = range(len(bar_words))

    bar_colors = plt.cm.viridis(np.linspace(0, 1, len(bar_words)))
    bar_ax.barh(positions, bar_freqs, color=bar_colors, alpha=0.8)
    bar_ax.set_yticks(positions)
    bar_ax.set_yticklabels(bar_words, fontsize=9)
    bar_ax.invert_yaxis()
    bar_ax.set_xlabel('频次', fontsize=10)
    bar_ax.set_title('Top 20 高频词', fontsize=12, fontweight='bold')
    bar_ax.grid(True, alpha=0.3, axis='x')

    for row, freq in enumerate(bar_freqs):
        bar_ax.text(freq, row, f' {freq}', va='center', fontsize=8)

    return fig
|
|
|
|
|
|
def extract_advanced_keywords(self, selected_places=None, top_n=30):
    """Extract keywords with TF-IDF and TextRank and chart them.

    The 2x2 figure shows the top 20 TF-IDF keywords, the top 20 TextRank
    keywords, and the top 15 TF-IDF keywords of reviews rated >= 4 and of
    reviews rated < 3.

    Args:
        selected_places: Place names forwarded to ``filter_data``.
        top_n: Number of keywords requested from each extractor.

    Returns:
        ``(figure, keywords)`` where ``keywords`` is a DataFrame with the
        TF-IDF and TextRank keywords and weights side by side. When one
        extractor yields fewer keywords than the other, the shorter columns
        are padded with NaN. For an empty selection the result is
        ``(self._create_empty_plot(...), empty DataFrame)``.
    """
    df = self.filter_data(selected_places)

    if len(df) == 0:
        return self._create_empty_plot('没有数据'), pd.DataFrame()

    text = ' '.join(df['评论内容'].astype(str))

    tfidf_keywords = jieba.analyse.extract_tags(text, topK=top_n, withWeight=True)
    textrank_keywords = jieba.analyse.textrank(text, topK=top_n, withWeight=True)

    positive_text = ' '.join(df[df['评分'] >= 4]['评论内容'].astype(str))
    negative_text = ' '.join(df[df['评分'] < 3]['评论内容'].astype(str))

    positive_keywords = jieba.analyse.extract_tags(positive_text, topK=15, withWeight=True) if positive_text else []
    negative_keywords = jieba.analyse.extract_tags(negative_text, topK=15, withWeight=True) if negative_text else []

    def draw_keyword_bars(ax, keywords, cmap, color_start, xlabel, title, empty_message=None):
        """Draw one horizontal keyword/weight bar panel on ``ax``."""
        if not keywords:
            # Only the sentiment panels announce that they have no data.
            if empty_message is not None:
                ax.text(0.5, 0.5, empty_message, ha='center', va='center',
                        transform=ax.transAxes, fontsize=12)
                ax.axis('off')
            return
        words = [w[0] for w in keywords]
        weights = [w[1] for w in keywords]
        colors = cmap(np.linspace(color_start, 1, len(words)))
        ax.barh(range(len(words)), weights, color=colors, alpha=0.8)
        ax.set_yticks(range(len(words)))
        ax.set_yticklabels(words, fontsize=9)
        ax.invert_yaxis()  # strongest keyword on top
        ax.set_xlabel(xlabel, fontsize=10)
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.grid(True, alpha=0.3, axis='x')

    fig = plt.figure(figsize=(18, 12))
    gs = fig.add_gridspec(2, 2, hspace=0.3, wspace=0.25)

    draw_keyword_bars(fig.add_subplot(gs[0, 0]), tfidf_keywords[:20], plt.cm.plasma, 0,
                      'TF-IDF权重', 'TF-IDF Top 20 关键词')
    draw_keyword_bars(fig.add_subplot(gs[0, 1]), textrank_keywords[:20], plt.cm.viridis, 0,
                      'TextRank权重', 'TextRank Top 20 关键词')
    draw_keyword_bars(fig.add_subplot(gs[1, 0]), positive_keywords, plt.cm.Greens, 0.4,
                      '权重', '正面评论关键词 (评分≥4)', empty_message='无正面评论数据')
    draw_keyword_bars(fig.add_subplot(gs[1, 1]), negative_keywords, plt.cm.Reds, 0.4,
                      '权重', '负面评论关键词 (评分<3)', empty_message='无负面评论数据')

    fig.suptitle('多维度关键词分析', fontsize=16, fontweight='bold', y=0.995)

    # Series (not plain lists) so that columns of different length are
    # aligned and padded instead of raising "All arrays must be of the same
    # length" when the two extractors return different keyword counts.
    df_keywords = pd.DataFrame({
        'TF-IDF关键词': pd.Series([w[0] for w in tfidf_keywords[:top_n]], dtype=object),
        'TF-IDF权重': pd.Series([round(w[1], 4) for w in tfidf_keywords[:top_n]], dtype=float),
        'TextRank关键词': pd.Series([w[0] for w in textrank_keywords[:top_n]], dtype=object),
        'TextRank权重': pd.Series([round(w[1], 4) for w in textrank_keywords[:top_n]], dtype=float),
    })

    return fig, df_keywords
|
|
|
|
|
|
def advanced_sentiment_analysis(self, selected_places=None):
    """Chart the sentiment classes overall, per place and over time.

    The figure holds a pie chart of the overall sentiment split, a stacked
    percentage bar per place, and the monthly number of reviews per
    sentiment class. Every sentiment class uses the same colour in all three
    panels.

    Args:
        selected_places: Place names forwarded to ``filter_data``.

    Returns:
        ``(figure, stats_table)`` where ``stats_table`` lists, per sentiment
        class, the review count, mean rating, mean comment length and share
        in percent. For an empty selection the result is
        ``(self._create_empty_plot(...), empty DataFrame)``.
    """
    df = self.filter_data(selected_places)
    if df.empty:
        return self._create_empty_plot('没有数据'), pd.DataFrame()

    sentiment_order = ['非常满意', '满意', '一般', '不满意', '非常不满意']
    colors_pie = ['#2ecc71', '#3498db', '#f39c12', '#e67e22', '#e74c3c']
    sentiment_colors = dict(zip(sentiment_order, colors_pie))

    fig = plt.figure(figsize=(18, 10))
    gs = fig.add_gridspec(2, 3, hspace=0.3, wspace=0.3)

    # --- Overall split (pie) ---
    ax1 = fig.add_subplot(gs[0, 0])
    sentiment_counts = df['情感分类'].value_counts()
    # value_counts orders by frequency, so colours are looked up by label to
    # stay consistent with the other two panels.
    wedge_colors = [sentiment_colors.get(label, '#95a5a6') for label in sentiment_counts.index]
    explode = [0.05] * len(sentiment_counts)

    wedges, texts, autotexts = ax1.pie(
        sentiment_counts.values,
        autopct='%1.1f%%',
        colors=wedge_colors,
        startangle=90,
        explode=explode,
        textprops={'fontsize': 10},
        pctdistance=0.85
    )
    # Labels live in a legend beside the pie so they cannot overlap.
    ax1.legend(wedges, sentiment_counts.index,
               loc='center left', bbox_to_anchor=(1, 0.5), fontsize=9)

    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
        autotext.set_fontsize(8)

    ax1.set_title('整体情感分布', fontsize=12, fontweight='bold')

    # --- Per-place split (stacked percentage bars) ---
    ax2 = fig.add_subplot(gs[0, 1:])
    sentiment_by_place = pd.crosstab(df['景点'], df['情感分类'], normalize='index') * 100
    sentiment_by_place = sentiment_by_place.reindex(columns=sentiment_order, fill_value=0)
    sentiment_by_place.plot(kind='barh', stacked=True, ax=ax2,
                            color=colors_pie[:len(sentiment_by_place.columns)],
                            alpha=0.8)
    ax2.set_xlabel('百分比 (%)', fontsize=10)
    ax2.set_ylabel('')
    ax2.set_title('各景点情感分布对比', fontsize=12, fontweight='bold')
    ax2.legend(title='情感', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
    ax2.grid(True, alpha=0.3, axis='x')

    # --- Monthly counts per sentiment class ---
    ax3 = fig.add_subplot(gs[1, :])
    df_monthly_sentiment = df.set_index('时间').groupby(
        [pd.Grouper(freq='ME'), '情感分类']).size().unstack(fill_value=0)

    for col in sentiment_order:
        if col in df_monthly_sentiment.columns:
            ax3.plot(df_monthly_sentiment.index, df_monthly_sentiment[col],
                     marker='o', label=col, linewidth=2, color=sentiment_colors[col])

    ax3.set_ylabel('评论数', fontsize=10)
    ax3.set_xlabel('时间', fontsize=10)
    ax3.set_title('情感趋势变化', fontsize=12, fontweight='bold')
    ax3.legend(fontsize=9)
    ax3.grid(True, alpha=0.3)

    fig.suptitle('情感深度分析', fontsize=16, fontweight='bold', y=0.995)

    stats_table = df.groupby('情感分类').agg({
        '评分': ['count', 'mean'],
        '评论长度': 'mean'
    }).round(2)
    stats_table.columns = ['数量', '平均评分', '平均评论长度']
    stats_table['占比(%)'] = (stats_table['数量'] / len(df) * 100).round(1)

    return fig, stats_table
|
|
|
|
|
|
def comprehensive_place_comparison(self):
    """Compare all places on several indicators and rank them.

    Per place the method computes rating statistics, comment-length
    statistics, the number of distinct users, the share of reviews rated
    >= 4 (好评率), reviews per user (用户活跃度) and a composite score. The
    composite score is a weighted sum of four components, each scaled to
    0-5: mean rating (0.4), good-review rate (0.3), relative review count
    (0.2) and relative user activity (0.1).

    The figure shows a radar chart of the six best places, the composite
    ranking, a bubble chart (reviews vs. rating, sized by users, coloured by
    good-review rate) and a row-normalised heat map of the indicators.

    Returns:
        ``(figure, comparison)`` where ``comparison`` is the indicator table
        sorted by composite score, best first. When no data is loaded the
        result is ``(self._create_empty_plot(...), empty DataFrame)``.
    """
    df = self.all_data
    if df is None or df.empty:
        return self._create_empty_plot('没有数据'), pd.DataFrame()

    comparison = df.groupby('景点').agg({
        '评分': ['mean', 'std', 'median', 'count'],
        '评论长度': ['mean', 'median'],
        '用户名': 'nunique'
    }).round(2)

    comparison.columns = ['平均评分', '评分标准差', '评分中位数', '评论总数',
                          '平均评论长度', '评论长度中位数', '独立用户数']

    # Mean of the boolean "rating >= 4" per place: unlike dividing two
    # group sizes, a place without any good review gets 0 instead of NaN.
    good_rate = df['评分'].ge(4).groupby(df['景点']).mean() * 100
    comparison['好评率(%)'] = good_rate.round(1)

    comparison['用户活跃度'] = (comparison['评论总数'] / comparison['独立用户数']).round(2)

    # Weights sum to 1.0; the good-review rate previously entered the sum
    # unweighted and therefore dominated the score.
    comparison['综合得分'] = (
        comparison['平均评分'] * 0.4 +
        (comparison['好评率(%)'] / 20) * 0.3 +
        (comparison['评论总数'] / comparison['评论总数'].max() * 5) * 0.2 +
        (comparison['用户活跃度'] / comparison['用户活跃度'].max() * 5) * 0.1
    ).round(2)

    comparison = comparison.sort_values('综合得分', ascending=False)

    fig = plt.figure(figsize=(18, 12))
    gs = fig.add_gridspec(3, 2, hspace=0.35, wspace=0.25)

    # --- Radar chart of the six best places ---
    ax1 = fig.add_subplot(gs[0, :], projection='polar')
    top_places = comparison.head(6)

    categories = ['平均评分', '好评率', '评论数', '用户活跃度', '评论长度']
    N = len(categories)
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]  # repeat the first angle to close the polygon

    ax1.set_theta_offset(np.pi / 2)
    ax1.set_theta_direction(-1)
    ax1.set_xticks(angles[:-1])
    ax1.set_xticklabels(categories, fontsize=10)

    max_reviews = comparison['评论总数'].max()
    max_activity = comparison['用户活跃度'].max()
    max_length = comparison['平均评论长度'].max()
    for idx, (place, row) in enumerate(top_places.iterrows()):
        # Every axis is scaled to the common 0-5 range.
        values = [
            row['平均评分'],
            row['好评率(%)'] / 20,
            row['评论总数'] / max_reviews * 5,
            row['用户活跃度'] / max_activity * 5,
            row['平均评论长度'] / max_length * 5
        ]
        values += values[:1]
        ax1.plot(angles, values, 'o-', linewidth=2, label=place, alpha=0.7)
        ax1.fill(angles, values, alpha=0.15)

    ax1.set_ylim(0, 5)

    # Legend is placed outside the plot so it does not cover the labels.
    ax1.legend(loc='upper left', bbox_to_anchor=(1.15, 1.05), fontsize=9, framealpha=0.9)
    ax1.set_title('Top 6 景点雷达图对比', fontsize=13, fontweight='bold', pad=25)
    ax1.grid(True)

    # --- Composite score ranking ---
    ax2 = fig.add_subplot(gs[1, 0])
    colors = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(comparison)))
    ax2.barh(range(len(comparison)), comparison['综合得分'], color=colors, alpha=0.8)
    ax2.set_yticks(range(len(comparison)))
    ax2.set_yticklabels(comparison.index, fontsize=9)
    ax2.invert_yaxis()
    ax2.set_xlabel('综合得分', fontsize=10)
    ax2.set_title('景点综合得分排名', fontsize=12, fontweight='bold')
    ax2.grid(True, alpha=0.3, axis='x')

    for i, v in enumerate(comparison['综合得分'].values):
        ax2.text(v, i, f' {v:.2f}', va='center', fontsize=8)

    # --- Bubble chart ---
    ax3 = fig.add_subplot(gs[1, 1])
    scatter = ax3.scatter(comparison['评论总数'], comparison['平均评分'],
                          s=comparison['独立用户数'] * 10, alpha=0.6,
                          c=comparison['好评率(%)'], cmap='RdYlGn',
                          edgecolors='black', linewidth=1)

    for idx, place in enumerate(comparison.index):
        ax3.annotate(place,
                     (comparison.iloc[idx]['评论总数'], comparison.iloc[idx]['平均评分']),
                     fontsize=8, alpha=0.7)

    ax3.set_xlabel('评论总数', fontsize=10)
    ax3.set_ylabel('平均评分', fontsize=10)
    ax3.set_title('评分-评论数-用户数关系图\n(气泡大小=用户数, 颜色=好评率)',
                  fontsize=11, fontweight='bold')
    ax3.grid(True, alpha=0.3)

    cbar = plt.colorbar(scatter, ax=ax3)
    cbar.set_label('好评率(%)', fontsize=9)

    # --- Heat map (min-max normalised per indicator row) ---
    ax4 = fig.add_subplot(gs[2, :])
    heatmap_data = comparison[['平均评分', '评分标准差', '好评率(%)',
                               '评论总数', '用户活跃度', '综合得分']].T

    row_min = heatmap_data.min(axis=1)
    # A constant row (e.g. only one place) has range 0; dividing by 1
    # instead maps it to 0 rather than NaN.
    row_range = (heatmap_data.max(axis=1) - row_min).replace(0, 1)
    heatmap_normalized = heatmap_data.sub(row_min, axis=0).div(row_range, axis=0)

    sns.heatmap(heatmap_normalized, annot=heatmap_data.round(1), fmt='g',
                cmap='RdYlGn', ax=ax4, cbar_kws={'label': '标准化值'},
                linewidths=0.5, linecolor='gray')
    ax4.set_title('景点多维度指标热力图', fontsize=12, fontweight='bold', pad=15)
    ax4.set_xlabel('')
    ax4.set_ylabel('指标', fontsize=10)

    fig.suptitle('景点综合对比分析', fontsize=16, fontweight='bold', y=0.995)

    return fig, comparison
|
|
|
|
|
|
def user_profile_and_clustering(self, selected_places=None):
    """Profile reviewers, cluster them with K-Means and draw a six-panel figure.

    Reviews returned by ``self.filter_data(selected_places)`` are aggregated
    per user name into rating, activity and comment-length statistics. Every
    user receives a rule-based type label and a K-Means cluster label; the
    scaled features are projected onto two PCA components for plotting.

    Args:
        selected_places: Forwarded unchanged to ``self.filter_data``.

    Returns:
        A ``(figure, stats_table)`` tuple. ``stats_table`` holds one row per
        rule-based user type (user count, share, mean rating, mean number of
        reviews). When fewer than two distinct users are available, the
        figure comes from ``self._create_empty_plot`` and the table is an
        empty DataFrame.
    """
    cluster_cols = ['平均评分', '评论次数', '平均评论长度', '访问景点数', '评分稳定性']

    df = self.filter_data(selected_places)

    # The PCA projection below asks for two components, which requires at
    # least two samples (users). Report the problem as an empty plot, the
    # same way the other analyses in this file report insufficient data.
    if df['用户名'].nunique() < 2:
        return self._create_empty_plot('数据量不足(需要至少2位用户)'), pd.DataFrame()

    # ---- per-user aggregation -------------------------------------------
    user_features = df.groupby('用户名').agg({
        '评分': ['mean', 'std', 'count'],
        '评论长度': ['mean', 'std'],
        '景点': 'nunique',
        '时间': lambda x: (x.max() - x.min()).days
    }).reset_index()
    user_features.columns = ['用户ID', '平均评分', '评分标准差', '评论次数',
                             '平均评论长度', '评论长度标准差', '访问景点数', '活跃天数']

    # Standard deviations are NaN for users with a single review.
    user_features = user_features.fillna(0)

    user_features['评分稳定性'] = 1 / (1 + user_features['评分标准差'])
    user_features['是否活跃用户'] = (user_features['评论次数'] >= 3).astype(int)
    user_features['是否忠诚用户'] = (user_features['访问景点数'] > 1).astype(int)

    # ---- rule-based user type (first matching rule wins) ----------------
    review_count = user_features['评论次数']
    mean_rating = user_features['平均评分']
    user_features['用户类型'] = np.select(
        [
            (review_count >= 5) & (user_features['访问景点数'] > 1),
            (review_count >= 3) & (mean_rating >= 4),
            mean_rating < 3,
            review_count == 1,
        ],
        ['资深探索型', '满意常客型', '挑剔批评型', '偶然访客型'],
        default='普通游客型',
    )

    # ---- K-Means clustering + PCA projection ----------------------------
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(user_features[cluster_cols])

    n_clusters = min(4, len(user_features))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    user_features['聚类标签'] = kmeans.fit_predict(features_scaled)

    pca = PCA(n_components=2)
    features_pca = pca.fit_transform(features_scaled)
    user_features['PCA1'] = features_pca[:, 0]
    user_features['PCA2'] = features_pca[:, 1]

    cluster_profiles = user_features.groupby('聚类标签')[cluster_cols].mean()

    fig = plt.figure(figsize=(18, 14))
    gs = fig.add_gridspec(3, 3, hspace=0.35, wspace=0.3)

    # ---- panel 1: user type pie -----------------------------------------
    ax1 = fig.add_subplot(gs[0, 0])
    type_counts = user_features['用户类型'].value_counts()
    colors_type = ['#3498db', '#2ecc71', '#e74c3c', '#f39c12', '#9b59b6']

    wedges, texts, autotexts = ax1.pie(
        type_counts.values,
        autopct='%1.1f%%',
        colors=colors_type[:len(type_counts)],
        startangle=90,
        explode=[0.05] * len(type_counts),
        textprops={'fontsize': 10},
        pctdistance=0.85
    )
    # Labels live in a legend beside the pie so they cannot overlap.
    ax1.legend(wedges, type_counts.index,
               loc='center left', bbox_to_anchor=(1, 0.5), fontsize=9)
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
        autotext.set_fontsize(7)
    ax1.set_title('用户类型分布', fontsize=12, fontweight='bold')

    # ---- panel 2: clusters in PCA space ---------------------------------
    ax2 = fig.add_subplot(gs[0, 1:])
    scatter = ax2.scatter(user_features['PCA1'], user_features['PCA2'],
                          c=user_features['聚类标签'], cmap='viridis',
                          s=100, alpha=0.6, edgecolors='black', linewidth=0.5)

    # Cluster centres: per-cluster feature means pushed through the same
    # scaler and PCA that were fitted on the individual users.
    centers_pca = pca.transform(scaler.transform(cluster_profiles))
    ax2.scatter(centers_pca[:, 0], centers_pca[:, 1],
                c='red', s=300, alpha=0.8, marker='*',
                edgecolors='black', linewidth=2, label='聚类中心')

    ax2.set_xlabel(f'主成分1 ({pca.explained_variance_ratio_[0]:.1%} 方差)', fontsize=10)
    ax2.set_ylabel(f'主成分2 ({pca.explained_variance_ratio_[1]:.1%} 方差)', fontsize=10)
    ax2.set_title('用户聚类分析(PCA降维)', fontsize=12, fontweight='bold')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    cbar = fig.colorbar(scatter, ax=ax2)
    cbar.set_label('聚类标签', fontsize=9)

    # ---- panel 3: mean rating per user type -----------------------------
    ax3 = fig.add_subplot(gs[1, 0])
    type_scores = user_features.groupby('用户类型')['平均评分'].mean().sort_values()
    colors_bar = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(type_scores)))
    type_scores.plot(kind='barh', ax=ax3, color=colors_bar, alpha=0.8)
    ax3.set_xlabel('平均评分', fontsize=10)
    ax3.set_title('各类型用户平均评分', fontsize=12, fontweight='bold')
    ax3.grid(True, alpha=0.3, axis='x')
    for i, v in enumerate(type_scores.values):
        ax3.text(v, i, f' {v:.2f}', va='center', fontsize=9)

    # ---- panel 4: activity histogram ------------------------------------
    ax4 = fig.add_subplot(gs[1, 1])
    bins = [1, 2, 3, 5, 10, float('inf')]
    labels = ['1次', '2次', '3-4次', '5-9次', '10次+']
    # right=False makes the bins [1, 2), [2, 3), [3, 5), [5, 10), [10, inf).
    user_features['活跃度分组'] = pd.cut(user_features['评论次数'], bins=bins,
                                    labels=labels, right=False)
    activity_counts = user_features['活跃度分组'].value_counts().sort_index()

    colors_activity = ['#e74c3c', '#f39c12', '#3498db', '#2ecc71', '#9b59b6']
    activity_counts.plot(kind='bar', ax=ax4, color=colors_activity, alpha=0.8)
    ax4.set_xlabel('评论次数', fontsize=10)
    ax4.set_ylabel('用户数', fontsize=10)
    ax4.set_title('用户活跃度分布', fontsize=12, fontweight='bold')
    ax4.tick_params(axis='x', rotation=45)
    ax4.grid(True, alpha=0.3, axis='y')
    for i, v in enumerate(activity_counts.values):
        ax4.text(i, v, str(v), ha='center', va='bottom', fontsize=9)

    # ---- panel 5: number of distinct places per user --------------------
    ax5 = fig.add_subplot(gs[1, 2])
    place_counts = user_features['访问景点数'].value_counts().sort_index()
    ax5.bar(range(len(place_counts)), place_counts.values,
            color=plt.cm.plasma(np.linspace(0, 1, len(place_counts))), alpha=0.8)
    ax5.set_xticks(range(len(place_counts)))
    ax5.set_xticklabels([f'{i}个' for i in place_counts.index], fontsize=9)
    ax5.set_xlabel('访问景点数', fontsize=10)
    ax5.set_ylabel('用户数', fontsize=10)
    ax5.set_title('用户忠诚度分布', fontsize=12, fontweight='bold')
    ax5.grid(True, alpha=0.3, axis='y')

    # ---- panel 6: radar chart of cluster profiles -----------------------
    ax6 = fig.add_subplot(gs[2, :], projection='polar')

    categories = ['平均评分', '评论次数', '评论长度', '访问景点数', '评分稳定性']
    N = len(categories)
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]  # repeat the first angle to close the polygon

    ax6.set_theta_offset(np.pi / 2)
    ax6.set_theta_direction(-1)
    ax6.set_xticks(angles[:-1])
    ax6.set_xticklabels(categories, fontsize=10)

    feature_max = user_features[cluster_cols].max()

    def scale_to_five(value, column):
        """Rescale ``value`` to 0-5 relative to the column maximum (0 if max is 0)."""
        top = feature_max[column]
        return value / top * 5 if top > 0 else 0.0

    for cluster_id, row in cluster_profiles.iterrows():
        # The mean rating is plotted as-is; the other axes are rescaled so
        # that the largest observed user value maps to 5.
        values = [row['平均评分']]
        values += [scale_to_five(row[col], col) for col in cluster_cols[1:]]
        values += values[:1]

        ax6.plot(angles, values, 'o-', linewidth=2,
                 label=f'聚类{cluster_id}', alpha=0.7)
        ax6.fill(angles, values, alpha=0.15)

    ax6.set_ylim(0, 5)
    ax6.legend(loc='upper left', bbox_to_anchor=(1.15, 1.05), fontsize=10, framealpha=0.9)
    ax6.set_title('各聚类用户特征画像', fontsize=13, fontweight='bold', pad=25)
    ax6.grid(True)

    fig.suptitle('用户画像与聚类分析', fontsize=16, fontweight='bold', y=0.995)

    # ---- summary table, ordered like the pie chart ----------------------
    type_means = (user_features.groupby('用户类型')[['平均评分', '评论次数']]
                  .mean()
                  .reindex(type_counts.index))
    stats_table = pd.DataFrame({
        '用户类型': type_counts.index,
        '用户数': type_counts.values,
        '占比(%)': (type_counts.values / len(user_features) * 100).round(1),
        '平均评分': type_means['平均评分'].round(2).values,
        '平均评论次数': type_means['评论次数'].round(1).values,
    })

    return fig, stats_table
|
|
|
|
|
|
def ml_satisfaction_predictor(self, selected_places=None):
    """Train a random forest that predicts high ratings and plot its diagnostics.

    The target is whether a review's rating is at least 4. Features are the
    comment length (raw and banded), weekend flag, month, quarter, an integer
    code for the place, and the hour. The data is split 70/30 into train and
    test sets; the figure shows feature importances, the confusion matrix,
    the ROC curve, predicted-probability histograms, a metric radar chart and
    a per-place comparison of predicted vs. observed share of high ratings.

    Args:
        selected_places: Forwarded unchanged to ``self.filter_data``.

    Returns:
        A ``(figure, report_table)`` tuple; the table lists accuracy,
        precision, recall, F1 and ROC-AUC. When there are fewer than 50 rows,
        or the target has only one class (overall or in either split), the
        figure comes from ``self._create_empty_plot`` and the table is an
        empty DataFrame.
    """
    df = self.filter_data(selected_places)

    if len(df) < 50:
        return self._create_empty_plot('数据量不足(需要至少50条)'), pd.DataFrame()

    # Work on a private copy so the helper columns added below never leak
    # into the frame handed out by filter_data.
    # NOTE(review): filter_data may already return a copy - confirm.
    df = df.copy()

    # ---- feature engineering --------------------------------------------
    df['是否高分'] = (df['评分'] >= 4).astype(int)

    # include_lowest=True keeps zero-length comments in the first band.
    # cat.codes is -1 for values outside every bin (e.g. NaN), so adding 1
    # yields band numbers 1-4 and 0 for "unknown" instead of an int-cast
    # failure on NaN.
    length_band = pd.cut(df['评论长度'], bins=[0, 50, 150, 500, float('inf')],
                         labels=[1, 2, 3, 4], include_lowest=True)
    df['评论长度段'] = length_band.cat.codes.astype(int) + 1

    df['是周末'] = df['是否周末'].astype(int)
    df['月份编码'] = df['月份']
    df['季度编码'] = df['季度']

    # Integer code per place, in order of first appearance.
    place_encoding = {place: idx for idx, place in enumerate(df['景点'].unique())}
    df['景点编码'] = df['景点'].map(place_encoding)

    feature_cols = ['评论长度', '评论长度段', '是周末', '月份编码', '季度编码', '景点编码', '小时']
    X = df[feature_cols].fillna(0)
    y = df['是否高分']

    # A binary classifier, ROC curve and 2x2 confusion matrix all need both
    # classes to be present.
    if y.nunique() < 2:
        return self._create_empty_plot('评分类别单一,无法训练分类模型'), pd.DataFrame()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    if y_train.nunique() < 2 or y_test.nunique() < 2:
        return self._create_empty_plot('评分类别分布过于不均,无法评估分类模型'), pd.DataFrame()

    # ---- model -----------------------------------------------------------
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    rf_model.fit(X_train, y_train)

    y_pred = rf_model.predict(X_test)
    y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

    feature_importance = pd.DataFrame({
        '特征': feature_cols,
        '重要性': rf_model.feature_importances_
    }).sort_values('重要性', ascending=False)

    from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                                 f1_score, roc_auc_score, roc_curve,
                                 confusion_matrix)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    # Fix the label order so the matrix always matches the tick labels.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    fig = plt.figure(figsize=(18, 12))
    gs = fig.add_gridspec(3, 3, hspace=0.35, wspace=0.3)

    # ---- panel 1: feature importance ------------------------------------
    ax1 = fig.add_subplot(gs[0, :])
    colors = plt.cm.viridis(np.linspace(0, 1, len(feature_importance)))
    ax1.barh(range(len(feature_importance)), feature_importance['重要性'],
             color=colors, alpha=0.8)
    ax1.set_yticks(range(len(feature_importance)))
    ax1.set_yticklabels(feature_importance['特征'], fontsize=10)
    ax1.invert_yaxis()
    ax1.set_xlabel('重要性得分', fontsize=11)
    ax1.set_title('特征重要性排名(随机森林)', fontsize=13, fontweight='bold')
    ax1.grid(True, alpha=0.3, axis='x')
    for i, v in enumerate(feature_importance['重要性'].values):
        ax1.text(v, i, f' {v:.3f}', va='center', fontsize=9)

    # ---- panel 2: confusion matrix --------------------------------------
    ax2 = fig.add_subplot(gs[1, 0])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2,
                xticklabels=['低分', '高分'], yticklabels=['低分', '高分'])
    ax2.set_xlabel('预测标签', fontsize=10)
    ax2.set_ylabel('真实标签', fontsize=10)
    ax2.set_title('混淆矩阵', fontsize=12, fontweight='bold')

    # ---- panel 3: ROC curve ---------------------------------------------
    ax3 = fig.add_subplot(gs[1, 1])
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    ax3.plot(fpr, tpr, linewidth=2, label=f'ROC (AUC={auc:.3f})', color='#2ecc71')
    ax3.plot([0, 1], [0, 1], 'k--', linewidth=1, label='随机猜测')
    ax3.fill_between(fpr, tpr, alpha=0.3, color='#2ecc71')
    ax3.set_xlabel('假阳性率', fontsize=10)
    ax3.set_ylabel('真阳性率', fontsize=10)
    ax3.set_title('ROC曲线', fontsize=12, fontweight='bold')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # ---- panel 4: predicted probability by true class -------------------
    ax4 = fig.add_subplot(gs[1, 2])
    # Use positional boolean masks: y_pred_proba is a plain array while
    # y_test is a Series carrying the original row index.
    actual = y_test.values
    ax4.hist(y_pred_proba[actual == 0], bins=30, alpha=0.6, color='#e74c3c', label='实际低分')
    ax4.hist(y_pred_proba[actual == 1], bins=30, alpha=0.6, color='#2ecc71', label='实际高分')
    ax4.axvline(0.5, color='black', linestyle='--', linewidth=2, label='决策阈值')
    ax4.set_xlabel('预测为高分的概率', fontsize=10)
    ax4.set_ylabel('样本数', fontsize=10)
    ax4.set_title('预测概率分布', fontsize=12, fontweight='bold')
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')

    # ---- panel 5: metric radar ------------------------------------------
    ax5 = fig.add_subplot(gs[2, 0], projection='polar')
    metrics_names = ['准确率', '精确率', '召回率', 'F1分数', 'AUC']
    metrics_values = [accuracy, precision, recall, f1, auc]

    N = len(metrics_names)
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    # Repeat the first point to close the polygon.
    metrics_values += metrics_values[:1]
    angles += angles[:1]

    ax5.plot(angles, metrics_values, 'o-', linewidth=2, color='#3498db', label='模型表现')
    ax5.fill(angles, metrics_values, alpha=0.25, color='#3498db')
    ax5.set_xticks(angles[:-1])
    ax5.set_xticklabels(metrics_names, fontsize=9)
    ax5.set_ylim(0, 1)
    ax5.set_title('模型评估指标', fontsize=12, fontweight='bold', pad=20)
    ax5.legend(loc='upper right')
    ax5.grid(True)

    # ---- panel 6: predicted vs. observed share per place ----------------
    ax6 = fig.add_subplot(gs[2, 1:])
    df_with_pred = df.copy()
    # Scored on every row, i.e. including the rows the model was trained on.
    df_with_pred['预测高分概率'] = rf_model.predict_proba(X)[:, 1]

    place_pred = df_with_pred.groupby('景点').agg({
        '预测高分概率': 'mean',
        '是否高分': 'mean'
    }).sort_values('预测高分概率', ascending=True)

    x = np.arange(len(place_pred))
    width = 0.35

    ax6.barh(x - width / 2, place_pred['预测高分概率'] * 100, width,
             label='预测满意度', color='#3498db', alpha=0.8)
    ax6.barh(x + width / 2, place_pred['是否高分'] * 100, width,
             label='实际满意度', color='#2ecc71', alpha=0.8)

    ax6.set_yticks(x)
    ax6.set_yticklabels(place_pred.index, fontsize=9)
    ax6.set_xlabel('满意度 (%)', fontsize=10)
    ax6.set_title('各景点预测 vs 实际满意度', fontsize=12, fontweight='bold')
    ax6.legend()
    ax6.grid(True, alpha=0.3, axis='x')

    fig.suptitle('🤖 机器学习满意度预测模型', fontsize=16, fontweight='bold', y=0.995)

    report_table = pd.DataFrame({
        '评估指标': ['准确率', '精确率', '召回率', 'F1分数', 'AUC-ROC'],
        '得分': [f'{accuracy:.3f}', f'{precision:.3f}', f'{recall:.3f}',
               f'{f1:.3f}', f'{auc:.3f}'],
        '说明': [
            '预测正确的比例',
            '预测为高分中真正高分的比例',
            '实际高分中被预测出的比例',
            '精确率和召回率的调和平均',
            '模型区分能力(越接近1越好)'
        ]
    })

    return fig, report_table
|
|
|
|
|
|
def intelligent_tour_recommendation(self, user_preferences=None):
    """Rank places by a composite recommendation score and plot the result.

    Per-place statistics are computed from ``self.all_data`` and combined as:

    * popularity = 0.6 * (reviews / max reviews) + 0.4 * (users / max users)
    * quality    = 0.7 * (mean rating / 5) + 0.3 * share of ratings >= 4
    * score      = 0.6 * quality + 0.3 * popularity + 0.1 * stability,
      where stability = 1 - rating std / max rating std.

    Places are also grouped into at most three clusters with Ward
    hierarchical clustering on (mean rating, good-review share, popularity).

    Args:
        user_preferences: Accepted for interface compatibility; not used by
            this method.

    Returns:
        A ``(figure, recommendation_table)`` tuple; the table is sorted by
        descending score and carries a C/B/A/S grade. With fewer than two
        places the figure comes from ``self._create_empty_plot`` and the
        table is an empty DataFrame.
    """
    df = self.all_data

    # Hierarchical clustering needs at least two observations.
    if df['景点'].nunique() < 2:
        return self._create_empty_plot('景点数量不足(需要至少2个景点)'), pd.DataFrame()

    def share_of_max(series):
        """Scale ``series`` by its maximum; all zeros when the maximum is not positive."""
        top = series.max()
        return series / top if top > 0 else series * 0.0

    # ---- per-place statistics -------------------------------------------
    place_stats = df.groupby('景点').agg({
        '评分': ['mean', 'std', 'count'],
        '评论长度': 'mean',
        '用户名': 'nunique'
    }).reset_index()
    place_stats.columns = ['景点', '平均评分', '评分标准差', '评论数', '平均评论长度', '独立用户数']

    # Places with no rating >= 4 are missing from the numerator -> share 0.
    good_rate = df[df['评分'] >= 4].groupby('景点').size() / df.groupby('景点').size()
    place_stats['好评率'] = place_stats['景点'].map(good_rate).fillna(0)

    place_stats['热度得分'] = (
        share_of_max(place_stats['评论数']) * 0.6 +
        share_of_max(place_stats['独立用户数']) * 0.4
    )

    place_stats['质量得分'] = (
        (place_stats['平均评分'] / 5) * 0.7 +
        place_stats['好评率'] * 0.3
    )

    # Stability bonus: the rating std is NaN for single-review places and
    # the max std can be 0 when every place is perfectly consistent. Both
    # used to turn the whole score into NaN. An undefined std now earns no
    # bonus; a zero max std earns the full bonus.
    rating_std = place_stats['评分标准差']
    max_std = rating_std.max()
    if max_std > 0:
        stability = 1 - rating_std / max_std
    else:
        stability = pd.Series(1.0, index=place_stats.index)
    stability = stability.where(rating_std.notna(), 0.0)

    place_stats['推荐得分'] = (
        place_stats['质量得分'] * 0.6 +
        place_stats['热度得分'] * 0.3 +
        stability * 0.1
    )

    place_stats = place_stats.sort_values('推荐得分', ascending=False)

    # ---- hierarchical clustering of places ------------------------------
    from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

    clustering_features = place_stats[['平均评分', '好评率', '热度得分']].values
    scaler = StandardScaler()
    clustering_features_scaled = scaler.fit_transform(clustering_features)

    linkage_matrix = linkage(clustering_features_scaled, method='ward')
    # Group ids are 1-based and there are at most three of them.
    place_stats['景点分组'] = fcluster(linkage_matrix, t=3, criterion='maxclust')

    fig = plt.figure(figsize=(20, 14))
    gs = fig.add_gridspec(3, 3, hspace=0.35, wspace=0.3)

    # ---- panel 1: ranking -----------------------------------------------
    ax1 = fig.add_subplot(gs[0, :])
    colors = plt.cm.RdYlGn(np.linspace(0.3, 1, len(place_stats)))
    ax1.barh(range(len(place_stats)), place_stats['推荐得分'],
             color=colors, alpha=0.8)
    ax1.set_yticks(range(len(place_stats)))
    ax1.set_yticklabels(place_stats['景点'], fontsize=10)
    ax1.invert_yaxis()
    ax1.set_xlabel('综合推荐得分', fontsize=11)
    ax1.set_title('🏆 景点智能推荐排名', fontsize=13, fontweight='bold')
    ax1.grid(True, alpha=0.3, axis='x')
    for i, score in enumerate(place_stats['推荐得分'].values):
        ax1.text(score, i, f" {score:.3f}", va='center', fontsize=9)

    # ---- panel 2: 3D feature space --------------------------------------
    ax2 = fig.add_subplot(gs[1, 0], projection='3d')
    ax2.scatter(place_stats['质量得分'],
                place_stats['热度得分'],
                place_stats['平均评分'],
                c=place_stats['景点分组'],
                cmap='Set2', s=200, alpha=0.7,
                edgecolors='black', linewidth=1)
    ax2.set_xlabel('质量得分', fontsize=9)
    ax2.set_ylabel('热度得分', fontsize=9)
    ax2.set_zlabel('平均评分', fontsize=9)
    ax2.set_title('景点三维特征空间', fontsize=11, fontweight='bold')
    for _, row in place_stats.iterrows():
        ax2.text(row['质量得分'], row['热度得分'], row['平均评分'],
                 row['景点'], fontsize=7, alpha=0.8)

    # ---- panel 3: dendrogram --------------------------------------------
    ax3 = fig.add_subplot(gs[1, 1:])
    # Labels follow the sorted row order the linkage was computed from.
    dendrogram(linkage_matrix, labels=place_stats['景点'].values, ax=ax3,
               leaf_font_size=9, color_threshold=0)
    ax3.set_xlabel('景点', fontsize=10)
    ax3.set_ylabel('距离', fontsize=10)
    ax3.set_title('景点层次聚类树状图', fontsize=12, fontweight='bold')
    ax3.tick_params(axis='x', rotation=90)
    ax3.grid(True, alpha=0.3, axis='y')

    # ---- panel 4: radar chart of group profiles -------------------------
    ax4 = fig.add_subplot(gs[2, 0], projection='polar')

    group_profiles = place_stats.groupby('景点分组')[
        ['平均评分', '好评率', '热度得分', '质量得分', '推荐得分']
    ].mean()

    categories = ['平均评分', '好评率', '热度', '质量', '推荐度']
    N = len(categories)
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]  # close the polygon

    ax4.set_theta_offset(np.pi / 2)
    ax4.set_theta_direction(-1)
    ax4.set_xticks(angles[:-1])
    ax4.set_xticklabels(categories, fontsize=10)

    colors_group = ['#e74c3c', '#3498db', '#2ecc71']
    for idx, (group_id, row) in enumerate(group_profiles.iterrows()):
        values = [
            row['平均评分'] / 5,  # rescaled to 0-1 like the other axes
            row['好评率'],
            row['热度得分'],
            row['质量得分'],
            row['推荐得分']
        ]
        values += values[:1]

        ax4.plot(angles, values, 'o-', linewidth=2,
                 label=f'分组{group_id}', color=colors_group[idx], alpha=0.7)
        ax4.fill(angles, values, alpha=0.15, color=colors_group[idx])

    ax4.set_ylim(0, 1)
    ax4.legend(loc='upper left', bbox_to_anchor=(1.15, 1.05), fontsize=10, framealpha=0.9)
    ax4.set_title('景点分组特征对比', fontsize=11, fontweight='bold', pad=20)
    ax4.grid(True)

    # ---- panel 5: quality / popularity matrix ---------------------------
    ax5 = fig.add_subplot(gs[2, 1])

    quality_median = place_stats['质量得分'].median()
    heat_median = place_stats['热度得分'].median()

    for _, row in place_stats.iterrows():
        color = colors_group[int(row['景点分组']) - 1]
        ax5.scatter(row['质量得分'], row['热度得分'],
                    s=200, alpha=0.7, color=color, edgecolors='black', linewidth=1)
        ax5.annotate(row['景点'], (row['质量得分'], row['热度得分']),
                     fontsize=8, alpha=0.8, ha='center')

    # Median lines split the plane into the four labelled quadrants.
    ax5.axhline(heat_median, color='gray', linestyle='--', alpha=0.5)
    ax5.axvline(quality_median, color='gray', linestyle='--', alpha=0.5)

    ax5.text(0.05, 0.95, '高热度\n低质量', transform=ax5.transAxes,
             fontsize=9, alpha=0.6, va='top')
    ax5.text(0.95, 0.95, '高热度\n高质量\n⭐推荐', transform=ax5.transAxes,
             fontsize=9, alpha=0.6, va='top', ha='right',
             bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.3))
    ax5.text(0.05, 0.05, '低热度\n低质量', transform=ax5.transAxes,
             fontsize=9, alpha=0.6)
    ax5.text(0.95, 0.05, '低热度\n高质量\n💎潜力', transform=ax5.transAxes,
             fontsize=9, alpha=0.6, ha='right',
             bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3))

    ax5.set_xlabel('质量得分', fontsize=10)
    ax5.set_ylabel('热度得分', fontsize=10)
    ax5.set_title('质量-热度矩阵分析', fontsize=12, fontweight='bold')
    ax5.grid(True, alpha=0.3)

    # ---- panel 6: top-5 cards -------------------------------------------
    ax6 = fig.add_subplot(gs[2, 2])
    ax6.axis('off')

    top_5 = place_stats.head(5)

    y_pos = 0.95
    for i, (_, row) in enumerate(top_5.iterrows()):
        rect = Rectangle((0.05, y_pos - 0.18), 0.9, 0.16,
                         facecolor=colors[i], alpha=0.2,
                         edgecolor='black', linewidth=1,
                         transform=ax6.transAxes)
        ax6.add_patch(rect)

        text = f"🏅 {i + 1}. {row['景点']}\n"
        text += f"   推荐度: {row['推荐得分']:.3f} | "
        text += f"评分: {row['平均评分']:.2f} | "
        text += f"好评率: {row['好评率']:.1%}\n"
        text += f"   热度: {'🔥' * int(row['热度得分'] * 5)}"

        ax6.text(0.5, y_pos - 0.1, text,
                 transform=ax6.transAxes, fontsize=9,
                 ha='center', va='center',
                 bbox=dict(boxstyle='round', facecolor='white',
                           alpha=0.8, pad=0.3))

        y_pos -= 0.19

    ax6.set_title('🎯 Top 5 智能推荐路线', fontsize=12,
                  fontweight='bold', pad=10)

    fig.suptitle('🎯 AI智能旅游路线推荐系统', fontsize=16, fontweight='bold', y=0.995)

    # ---- result table ----------------------------------------------------
    recommendation_table = place_stats[['景点', '推荐得分', '平均评分', '好评率',
                                        '热度得分', '质量得分', '景点分组']].copy()
    # include_lowest=True so a score of exactly 0 is graded C instead of NaN.
    recommendation_table['推荐等级'] = pd.cut(recommendation_table['推荐得分'],
                                          bins=[0, 0.5, 0.7, 0.85, 1],
                                          labels=['C级', 'B级', 'A级', 'S级'],
                                          include_lowest=True)
    recommendation_table = recommendation_table.round(3)

    return fig, recommendation_table
|
|
|
|
|
|
def topic_modeling_analysis(self, selected_places=None, n_topics=5):
    """Fit an LDA topic model on review text and plot the topics.

    Review text is tokenised with jieba, filtered against ``STOPWORDS``
    (tokens of length 1 are dropped too), vectorised with TF-IDF and fed to
    scikit-learn's ``LatentDirichletAllocation``. Each review is assigned
    the topic with the highest probability.

    Args:
        selected_places: Forwarded unchanged to ``self.filter_data``.
        n_topics: Number of topics to extract; converted with ``int()`` and
            must be at least 1.

    Returns:
        A ``(figure, summary_table)`` tuple; the table has one row per topic
        (top keywords, review count and share, mean rating, most frequent
        place). If ``n_topics`` is below 1, there are fewer than 20 reviews,
        or the vectoriser cannot build a vocabulary, the figure comes from
        ``self._create_empty_plot`` and the table is an empty DataFrame.
    """
    # UI widgets may deliver the value as a float; grid specs and range()
    # need a real integer.
    n_topics = int(n_topics)
    if n_topics < 1:
        return self._create_empty_plot('主题数量必须至少为1'), pd.DataFrame()

    df = self.filter_data(selected_places)

    if len(df) < 20:
        return self._create_empty_plot('数据量不足(需要至少20条)'), pd.DataFrame()

    # Work on a private copy so the topic columns added below never leak
    # into the frame handed out by filter_data.
    # NOTE(review): filter_data may already return a copy - confirm.
    df = df.copy()

    # ---- text preparation -----------------------------------------------
    texts = df['评论内容'].astype(str).tolist()
    processed_texts = [
        ' '.join(w for w in jieba.cut(text) if len(w) > 1 and w not in STOPWORDS)
        for text in texts
    ]

    vectorizer = TfidfVectorizer(max_features=100, max_df=0.8, min_df=2)
    try:
        tfidf_matrix = vectorizer.fit_transform(processed_texts)
    except ValueError:
        # Raised by scikit-learn when no term survives the frequency
        # filters (empty vocabulary).
        return self._create_empty_plot('文本数据不足以进行主题分析'), pd.DataFrame()

    # ---- LDA -------------------------------------------------------------
    from sklearn.decomposition import LatentDirichletAllocation

    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=42,
        max_iter=50,
        learning_method='batch'
    )
    lda_output = lda_model.fit_transform(tfidf_matrix)

    # Top ten terms (and their weights) per topic, heaviest first.
    feature_names = vectorizer.get_feature_names_out()
    topics_words = []
    for topic in lda_model.components_:
        top_indices = topic.argsort()[-10:][::-1]
        topics_words.append(([feature_names[i] for i in top_indices],
                             [topic[i] for i in top_indices]))

    df['主题编号'] = lda_output.argmax(axis=1)
    df['主题概率'] = lda_output.max(axis=1)

    fig = plt.figure(figsize=(20, 12))
    gs = fig.add_gridspec(3, n_topics, hspace=0.4, wspace=0.3)
    colors_topic = plt.cm.Set3(np.linspace(0, 1, n_topics))
    topic_labels = [f'主题{i + 1}' for i in range(n_topics)]

    # ---- row 1: keyword weights per topic -------------------------------
    for topic_idx, (words, weights) in enumerate(topics_words):
        ax = fig.add_subplot(gs[0, topic_idx])
        ax.barh(range(len(words)), weights,
                color=colors_topic[topic_idx], alpha=0.8)
        ax.set_yticks(range(len(words)))
        ax.set_yticklabels(words, fontsize=8)
        ax.invert_yaxis()
        ax.set_xlabel('权重', fontsize=9)
        ax.set_title(f'主题 {topic_idx + 1}', fontsize=11, fontweight='bold')
        ax.grid(True, alpha=0.3, axis='x')

    # ---- row 2: reviews per topic ---------------------------------------
    ax_dist = fig.add_subplot(gs[1, :])
    # reindex so that topics nobody was assigned to show up as zero.
    topic_counts = (df['主题编号'].value_counts()
                    .reindex(range(n_topics), fill_value=0))

    bars = ax_dist.bar(range(n_topics), topic_counts.values,
                       color=colors_topic, alpha=0.8, edgecolor='black', linewidth=1)
    ax_dist.set_xticks(range(n_topics))
    ax_dist.set_xticklabels(topic_labels, fontsize=10)
    ax_dist.set_ylabel('评论数量', fontsize=11)
    ax_dist.set_title('各主题评论分布', fontsize=13, fontweight='bold')
    ax_dist.grid(True, alpha=0.3, axis='y')

    for bar in bars:
        height = bar.get_height()
        ax_dist.text(bar.get_x() + bar.get_width() / 2., height,
                     f'{int(height)}\n({height / len(df) * 100:.1f}%)',
                     ha='center', va='bottom', fontsize=9)

    # ---- row 3: layout ---------------------------------------------------
    # With three or more topic columns the row is split at column 2. With
    # fewer columns that split would leave an empty slot, so the row is
    # divided into two equal halves instead.
    if n_topics >= 3:
        score_slot, heatmap_slot = gs[2, :2], gs[2, 2:]
    else:
        bottom_row = gs[2, :].subgridspec(1, 2, wspace=0.3)
        score_slot, heatmap_slot = bottom_row[0, 0], bottom_row[0, 1]

    # ---- row 3 left: mean rating per topic ------------------------------
    ax_score = fig.add_subplot(score_slot)
    # Missing topics and single-review topics (NaN std) are drawn as 0.
    topic_scores = (df.groupby('主题编号')['评分'].agg(['mean', 'std'])
                    .reindex(range(n_topics))
                    .fillna(0))

    x = range(n_topics)
    ax_score.bar(x, topic_scores['mean'].values, yerr=topic_scores['std'].values,
                 color=colors_topic, alpha=0.8,
                 capsize=5, edgecolor='black', linewidth=1)
    ax_score.set_xticks(x)
    ax_score.set_xticklabels(topic_labels, fontsize=10)
    ax_score.set_ylabel('平均评分', fontsize=11)
    ax_score.set_title('各主题平均评分(含标准差)', fontsize=12, fontweight='bold')
    ax_score.axhline(df['评分'].mean(), color='red', linestyle='--',
                     alpha=0.5, label='总体均值')
    ax_score.legend()
    ax_score.grid(True, alpha=0.3, axis='y')

    # ---- row 3 right: place x topic heatmap (multi-place data only) -----
    if df['景点'].nunique() > 1:
        ax_heatmap = fig.add_subplot(heatmap_slot)
        topic_place = pd.crosstab(df['景点'], df['主题编号'], normalize='index') * 100
        # Use the same 1-based topic labels as the other panels and the
        # summary table.
        topic_place.columns = [f'主题{int(c) + 1}' for c in topic_place.columns]

        sns.heatmap(topic_place, annot=True, fmt='.1f', cmap='YlOrRd',
                    ax=ax_heatmap, cbar_kws={'label': '百分比(%)'},
                    linewidths=0.5, linecolor='gray')
        ax_heatmap.set_xlabel('主题编号', fontsize=10)
        ax_heatmap.set_ylabel('景点', fontsize=10)
        ax_heatmap.set_title('景点-主题分布热力图', fontsize=12, fontweight='bold')

    fig.suptitle(f'📚 主题建模分析(LDA,{n_topics}个主题)',
                 fontsize=16, fontweight='bold', y=0.995)

    # ---- summary table ---------------------------------------------------
    topic_summary = []
    for topic_idx in range(n_topics):
        words, _ = topics_words[topic_idx]
        topic_df = df[df['主题编号'] == topic_idx]
        # mode() is empty when the topic has no reviews (or no place names).
        top_places = topic_df['景点'].mode()

        topic_summary.append({
            '主题编号': f'主题{topic_idx + 1}',
            '关键词': '、'.join(words[:5]),
            '评论数': len(topic_df),
            '占比(%)': round(len(topic_df) / len(df) * 100, 1),
            '平均评分': round(topic_df['评分'].mean(), 2) if len(topic_df) > 0 else 0,
            '主要景点': top_places.iloc[0] if len(top_places) > 0 else 'N/A'
        })

    summary_table = pd.DataFrame(topic_summary)

    return fig, summary_table
|
|
|
|
|
|
def anomaly_detection_analysis(self, selected_places=None):
    """Flag unusual reviews with an Isolation Forest and plot the findings.

    The model sees six features per review: rating and comment length, each
    in raw, mean-centred and standardised form. About 10% of the reviews
    are flagged (``contamination=0.1``). Flagged reviews are then bucketed
    by simple rules (low rating + short, high rating + short, very long,
    other).

    Args:
        selected_places: Forwarded unchanged to ``self.filter_data``.

    Returns:
        A ``(figure, anomaly_report)`` tuple; the report lists counts and
        the mean rating / length of normal vs. flagged reviews. With fewer
        than 30 reviews the figure comes from ``self._create_empty_plot``
        and the table is an empty DataFrame.
    """
    df = self.filter_data(selected_places)

    if len(df) < 30:
        return self._create_empty_plot('数据量不足(需要至少30条)'), pd.DataFrame()

    # Work on a private copy so the anomaly columns added below never leak
    # into the frame handed out by filter_data.
    # NOTE(review): filter_data may already return a copy - confirm.
    df = df.copy()

    # ---- features --------------------------------------------------------
    rating = df['评分']
    length = df['评论长度']
    rating_centered = rating - rating.mean()
    length_centered = length - length.mean()

    features = pd.DataFrame({
        '评分': rating,
        '评论长度': length,
        '评分-均值差': rating_centered,
        '长度-均值差': length_centered,
        '评分标准化': rating_centered / rating.std(),
        '长度标准化': length_centered / length.std(),
    })
    # A zero standard deviation (all values identical) produces NaN/inf
    # z-scores, which the model cannot ingest. Impute column means, then 0
    # for columns that are entirely undefined.
    features = features.replace([np.inf, -np.inf], np.nan)
    features = features.fillna(features.mean()).fillna(0)

    from sklearn.ensemble import IsolationForest

    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    anomaly_labels = iso_forest.fit_predict(features)
    anomaly_scores = iso_forest.score_samples(features)

    # fit_predict marks outliers with -1; score_samples is lower for more
    # abnormal rows, so negate it to get "higher = more anomalous".
    df['是否异常'] = (anomaly_labels == -1).astype(int)
    df['异常得分'] = -anomaly_scores

    fig = plt.figure(figsize=(18, 12))
    gs = fig.add_gridspec(3, 3, hspace=0.35, wspace=0.3)

    # ---- panel 1: rating vs. length scatter -----------------------------
    ax1 = fig.add_subplot(gs[0, :2])

    normal_data = df[df['是否异常'] == 0]
    anomaly_data = df[df['是否异常'] == 1]

    ax1.scatter(normal_data['评分'], normal_data['评论长度'],
                alpha=0.5, s=50, c='#3498db', label='正常评论', edgecolors='none')
    ax1.scatter(anomaly_data['评分'], anomaly_data['评论长度'],
                alpha=0.8, s=100, c='#e74c3c', label='异常评论',
                marker='X', edgecolors='black', linewidth=1)

    ax1.set_xlabel('评分', fontsize=10)
    ax1.set_ylabel('评论长度', fontsize=10)
    ax1.set_title('异常评论检测(评分-长度空间)', fontsize=12, fontweight='bold')
    ax1.legend(fontsize=10)
    ax1.grid(True, alpha=0.3)

    # ---- panel 2: anomaly score histogram -------------------------------
    ax2 = fig.add_subplot(gs[0, 2])
    ax2.hist(df['异常得分'], bins=30, color='#9b59b6', alpha=0.7, edgecolor='black')
    threshold = df['异常得分'].quantile(0.9)
    ax2.axvline(threshold, color='red', linestyle='--', linewidth=2,
                label=f'90%分位数: {threshold:.3f}')
    ax2.set_xlabel('异常得分', fontsize=10)
    ax2.set_ylabel('频数', fontsize=10)
    ax2.set_title('异常得分分布', fontsize=12, fontweight='bold')
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # ---- panel 3: anomalies per place (multi-place data only) -----------
    if df['景点'].nunique() > 1:
        ax3 = fig.add_subplot(gs[1, :])
        place_anomaly = df.groupby('景点').agg({
            '是否异常': ['sum', 'mean'],
            '评分': 'count'
        })
        place_anomaly.columns = ['异常数', '异常率', '总数']
        place_anomaly['异常率'] *= 100
        place_anomaly = place_anomaly.sort_values('异常率', ascending=True)

        x = np.arange(len(place_anomaly))
        width = 0.35

        # The bars are horizontal, so rate (%) and count need separate
        # x-axes over a shared y-axis: that is twiny(), not twinx().
        ax3_twin = ax3.twiny()
        bars1 = ax3.barh(x - width / 2, place_anomaly['异常率'], width,
                         label='异常率(%)', color='#e74c3c', alpha=0.8)
        bars2 = ax3_twin.barh(x + width / 2, place_anomaly['异常数'], width,
                              label='异常数', color='#3498db', alpha=0.8)

        ax3.set_yticks(x)
        ax3.set_yticklabels(place_anomaly.index, fontsize=9)
        ax3.set_xlabel('异常率 (%)', fontsize=10, color='#e74c3c')
        ax3_twin.set_xlabel('异常评论数', fontsize=10, color='#3498db')
        ax3.set_title('各景点异常评论分析', fontsize=12, fontweight='bold')
        ax3.tick_params(axis='x', labelcolor='#e74c3c')
        ax3_twin.tick_params(axis='x', labelcolor='#3498db')

        lines = [bars1, bars2]
        labels = [l.get_label() for l in lines]
        ax3.legend(lines, labels, loc='upper right')
        ax3.grid(True, alpha=0.3, axis='x')
        # Only one set of vertical grid lines; the two x scales differ.
        ax3_twin.grid(False)

    # ---- panel 4: normal vs. flagged feature comparison -----------------
    ax4 = fig.add_subplot(gs[2, 0])
    comparison_data = pd.DataFrame({
        label: [
            subset['评分'].mean(),
            subset['评论长度'].mean(),
            subset['评分'].std(),
            subset['评论长度'].std()
        ]
        for label, subset in (('正常评论', normal_data), ('异常评论', anomaly_data))
    }, index=['平均评分', '平均长度', '评分标准差', '长度标准差'])

    comparison_data.plot(kind='bar', ax=ax4, color=['#3498db', '#e74c3c'], alpha=0.8)
    ax4.set_ylabel('数值', fontsize=10)
    ax4.set_title('正常 vs 异常评论特征对比', fontsize=11, fontweight='bold')
    ax4.tick_params(axis='x', rotation=45)
    ax4.legend(fontsize=9)
    ax4.grid(True, alpha=0.3, axis='y')

    # ---- panel 5: rating box plots --------------------------------------
    ax5 = fig.add_subplot(gs[2, 1])
    box_data = [normal_data['评分'].dropna(), anomaly_data['评分'].dropna()]
    bp = ax5.boxplot(box_data, patch_artist=True)
    # Set the tick labels on the axes rather than via boxplot(labels=...),
    # a keyword newer matplotlib releases deprecate.
    ax5.set_xticklabels(['正常评论', '异常评论'])
    bp['boxes'][0].set_facecolor('#3498db')
    bp['boxes'][1].set_facecolor('#e74c3c')
    for element in ['whiskers', 'fliers', 'means', 'medians', 'caps']:
        plt.setp(bp[element], color='black', linewidth=1.5)

    ax5.set_ylabel('评分', fontsize=10)
    ax5.set_title('评分分布对比(箱线图)', fontsize=11, fontweight='bold')
    ax5.grid(True, alpha=0.3, axis='y')

    # ---- panel 6: anomaly type pie --------------------------------------
    ax6 = fig.add_subplot(gs[2, 2])

    # Rule-based bucket per review; the first matching rule wins. The 95%
    # length quantile is computed once instead of once per row.
    long_cutoff = df['评论长度'].quantile(0.95)
    is_short = df['评论长度'] < 20
    df['异常类型'] = np.select(
        [
            df['是否异常'] == 0,
            (df['评分'] < 2) & is_short,
            (df['评分'] >= 4.5) & is_short,
            df['评论长度'] > long_cutoff,
        ],
        ['正常', '低分短评', '高分短评', '超长评论'],
        default='其他异常',
    )
    anomaly_type_counts = df[df['是否异常'] == 1]['异常类型'].value_counts()

    colors_anomaly = ['#e74c3c', '#f39c12', '#9b59b6', '#e67e22']

    if len(anomaly_type_counts) > 0:
        wedges, texts, autotexts = ax6.pie(
            anomaly_type_counts.values,
            autopct='%1.1f%%',
            colors=colors_anomaly[:len(anomaly_type_counts)],
            startangle=90,
            explode=[0.05] * len(anomaly_type_counts),
            textprops={'fontsize': 10},
            pctdistance=0.85
        )
        # Labels live in a legend beside the pie so they cannot overlap.
        ax6.legend(wedges, anomaly_type_counts.index,
                   loc='center left', bbox_to_anchor=(1, 0.5), fontsize=9)

        for autotext in autotexts:
            autotext.set_color('white')
            autotext.set_fontweight('bold')
            autotext.set_fontsize(8)
    else:
        # Nothing was flagged (e.g. all reviews look identical to the model).
        ax6.text(0.5, 0.5, '未检测到异常评论', ha='center', va='center',
                 fontsize=11, transform=ax6.transAxes)
        ax6.axis('off')

    ax6.set_title('异常类型分布', fontsize=11, fontweight='bold')

    fig.suptitle('🚨 异常评论智能检测分析', fontsize=16, fontweight='bold', y=0.995)

    # ---- report table ----------------------------------------------------
    has_anomalies = len(anomaly_data) > 0
    anomaly_report = pd.DataFrame({
        '指标': ['总评论数', '异常评论数', '异常率(%)', '---',
               '正常评论平均分', '异常评论平均分', '---',
               '正常评论平均长度', '异常评论平均长度'],
        '数值': [
            len(df),
            len(anomaly_data),
            round(len(anomaly_data) / len(df) * 100, 2),
            '---',
            round(normal_data['评分'].mean(), 2),
            round(anomaly_data['评分'].mean(), 2) if has_anomalies else 0,
            '---',
            round(normal_data['评论长度'].mean(), 1),
            round(anomaly_data['评论长度'].mean(), 1) if has_anomalies else 0
        ]
    })

    return fig, anomaly_report
|
|
|
|
|
|
def _create_empty_plot(self, message):
    """Return a 14x7 figure that shows only ``message``, centred, with the axes hidden.

    Args:
        message: Text drawn in the middle of the otherwise empty figure.

    Returns:
        The matplotlib figure created for the placeholder.
    """
    figure, axes = plt.subplots(figsize=(14, 7))
    axes.axis('off')

    # Axes-fraction coordinates keep (0.5, 0.5) at the centre of the plot area.
    label_style = dict(
        ha='center',
        va='center',
        fontsize=16,
        transform=axes.transAxes,
    )
    axes.text(0.5, 0.5, message, **label_style)
    return figure
|
|
|
|
|
|
|
|
|
def _stat_card_html(gradient: str, title: str, value: str) -> str:
    """Build the HTML for one gradient summary card.

    Args:
        gradient: Colour stops placed inside ``linear-gradient(135deg, ...)``.
        title: Heading shown at the top of the card.
        value: Pre-formatted figure shown in large type.

    Returns:
        The card markup as newline-joined, unindented HTML lines.
    """
    return "\n".join((
        f'<div style="background: linear-gradient(135deg, {gradient});',
        'padding: 20px; border-radius: 10px; color: white; text-align: center;">',
        f'<h3 style="margin: 0;">{title}</h3>',
        '<p style="font-size: 32px; font-weight: bold; margin: 10px 0;">',
        f'{value}',
        '</p>',
        '</div>',
    ))


def _with_optional_places(analysis):
    """Adapt an analyzer method for use as a Gradio event handler.

    The returned callable forwards its first argument (the place selection)
    as ``None`` when it is empty or otherwise falsy, and passes any further
    arguments through unchanged.
    """
    def run(places, *extra):
        return analysis(places or None, *extra)

    return run


def create_interface():
    """Build the Gradio Blocks application for the tourism review analyzer.

    Creates a ``TourismDataAnalyzer('data')``, prints start-up progress, and
    lays out a header, a dataset overview banner, a place selector and one tab
    per analysis. Each tab's button is wired to the matching analyzer method.
    An empty place selection is passed to the analyzer as ``None``.

    Returns:
        The constructed ``gr.Blocks`` app; it is not launched here.
    """
    print("\n正在初始化分析器...")
    analyzer = TourismDataAnalyzer('data')

    all_places = sorted(analyzer.all_data['景点'].unique().tolist())

    print(f"✓ 界面初始化完成!发现 {len(all_places)} 个景点")
    print(f"✓ 使用字体: {CHINESE_FONT or '系统默认'}\n")

    # Dataset-wide figures are computed once and reused by the banner, the
    # summary cards and the footer.
    reviews = analyzer.all_data
    total_reviews = len(reviews)
    user_count = reviews['用户名'].nunique()
    first_day = reviews['时间'].min().date()
    last_day = reviews['时间'].max().date()

    # Shared by the slider's initial position and the initially rendered table.
    default_preview_rows = 100

    # (gradient colour stops, card title, formatted value)
    summary_cards = [
        ("#667eea 0%, #764ba2 100%", "📊 总评论数", f"{total_reviews:,}"),
        ("#f093fb 0%, #f5576c 100%", "🏞️ 景点数量", f"{reviews['景点'].nunique()}"),
        ("#4facfe 0%, #00f2fe 100%", "👥 用户数量", f"{user_count:,}"),
        ("#fa709a 0%, #fee140 100%", "⭐ 平均评分", f"{reviews['评分'].mean():.2f}"),
    ]

    def run_wordcloud(places, rating_choice, word_limit):
        # "全部评论" is the no-filter choice and is forwarded as None.
        rating = None if rating_choice == "全部评论" else rating_choice
        return analyzer.generate_advanced_wordcloud(places or None, rating, word_limit)

    with gr.Blocks(title="AI旅游智能分析系统", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🤖 AI驱动的旅游计划评估与智能决策系统
        ### 基于机器学习的深度数据分析与智能推荐平台
        """)

        gr.Markdown(f"""
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 25px; border-radius: 15px; color: white; margin: 20px 0;
        box-shadow: 0 4px 15px rgba(0,0,0,0.2);">
        <h3 style="margin: 0;">📊 数据概览</h3>
        <p style="margin: 15px 0; font-size: 18px;">
        总评论数: <b>{total_reviews}</b> 条 |
        景点数: <b>{len(all_places)}</b> 个 |
        用户数: <b>{user_count}</b> 人
        </p>
        <p style="margin: 5px 0;">
        时间范围: {first_day} 至 {last_day}
        </p>
        <p style="margin-top: 15px; font-size: 14px; opacity: 0.9;">
        🚀 集成机器学习算法:随机森林、层次聚类、主题建模(LDA)、异常检测(Isolation Forest)
        </p>
        </div>
        """)

        with gr.Row():
            place_selector = gr.CheckboxGroup(
                choices=all_places,
                label="🎯 选择要分析的景点(不选则分析全部数据)",
                value=[],
                elem_id="place_selector",
            )

        with gr.Tabs():
            with gr.Tab("📋 数据集预览"):
                gr.Markdown("""
                ### 📊 原始数据预览
                - **隐私保护**: 用户名已自动匿名化处理
                - **数据采样**: 随机展示部分评论数据
                - **字段说明**: 用户ID | 景点 | 评分 | 时间 | 评论内容 | 长度 | 情感分类
                """)

                with gr.Row():
                    sample_size = gr.Slider(
                        minimum=50,
                        maximum=500,
                        value=default_preview_rows,
                        step=50,
                        label="📊 预览数据量",
                    )
                    refresh_btn = gr.Button("🔄 刷新数据", variant="secondary", size="sm")

                preview_table = gr.DataFrame(
                    value=analyzer.get_preview_data(default_preview_rows),
                    label="数据集预览(用户名已匿名)",
                    wrap=True,
                    max_height=600,
                    interactive=False,
                )

                with gr.Row():
                    for gradient, title, value in summary_cards:
                        with gr.Column(scale=1):
                            gr.Markdown(_stat_card_html(gradient, title, value))

                # Both the button and the slider re-sample the preview table.
                refresh_btn.click(
                    fn=analyzer.get_preview_data,
                    inputs=[sample_size],
                    outputs=[preview_table],
                )
                sample_size.change(
                    fn=analyzer.get_preview_data,
                    inputs=[sample_size],
                    outputs=[preview_table],
                )

            with gr.Tab("⭐ 评分深度分析"):
                gr.Markdown("### 多维度评分分析,含分布、箱线图、景点对比等")
                rating_btn = gr.Button("📊 生成评分深度分析", variant="primary", size="lg")
                rating_plot = gr.Plot(label="评分分析图表")

                rating_btn.click(
                    fn=_with_optional_places(analyzer.plot_advanced_rating_analysis),
                    inputs=[place_selector],
                    outputs=[rating_plot],
                )

            with gr.Tab("📅 时间趋势分析"):
                gr.Markdown("### 评论数量、评分、季节性、工作日/周末全面分析")
                time_btn = gr.Button("📈 生成时间趋势分析", variant="primary", size="lg")
                time_plot = gr.Plot(label="时间趋势图表")

                time_btn.click(
                    fn=_with_optional_places(analyzer.plot_time_trend_analysis),
                    inputs=[place_selector],
                    outputs=[time_plot],
                )

            with gr.Tab("☁️ 高级词云分析"):
                gr.Markdown("### 可视化高频词汇,支持按评分筛选,含词频统计")
                with gr.Row():
                    rating_filter = gr.Radio(
                        choices=["全部评论", "高分评论 (>=4)", "低分评论 (<3)"],
                        value="全部评论",
                        label="评论筛选",
                    )
                    word_count = gr.Slider(50, 200, value=100, step=10, label="词云词汇数量")

                wordcloud_btn = gr.Button("☁️ 生成高级词云", variant="primary", size="lg")
                wordcloud_plot = gr.Plot(label="词云与词频图")

                wordcloud_btn.click(
                    fn=run_wordcloud,
                    inputs=[place_selector, rating_filter, word_count],
                    outputs=[wordcloud_plot],
                )

            with gr.Tab("🔑 多维度关键词提取"):
                gr.Markdown("### TF-IDF + TextRank双算法,正负面关键词对比")
                keyword_num = gr.Slider(10, 50, value=30, step=5, label="关键词数量")
                keyword_btn = gr.Button("🔍 提取多维度关键词", variant="primary", size="lg")

                keyword_plot = gr.Plot(label="关键词分析图表")
                keyword_table = gr.DataFrame(label="关键词详细列表")

                keyword_btn.click(
                    fn=_with_optional_places(analyzer.extract_advanced_keywords),
                    inputs=[place_selector, keyword_num],
                    outputs=[keyword_plot, keyword_table],
                )

            with gr.Tab("😊 情感深度分析"):
                gr.Markdown("### 五级情感分类,景点对比,时间趋势全覆盖")
                sentiment_btn = gr.Button("💭 生成情感深度分析", variant="primary", size="lg")

                sentiment_plot = gr.Plot(label="情感分析图表")
                sentiment_table = gr.DataFrame(label="情感统计数据")

                sentiment_btn.click(
                    fn=_with_optional_places(analyzer.advanced_sentiment_analysis),
                    inputs=[place_selector],
                    outputs=[sentiment_plot, sentiment_table],
                )

            with gr.Tab("🏆 景点综合对比"):
                gr.Markdown("### 雷达图、热力图、散点图多角度对比景点表现")
                comparison_btn = gr.Button("📊 生成景点综合对比", variant="primary", size="lg")

                comparison_plot = gr.Plot(label="景点对比图表")
                comparison_table = gr.DataFrame(label="景点详细数据", wrap=True)

                # This comparison takes no inputs; the place selector is not consulted.
                comparison_btn.click(
                    fn=analyzer.comprehensive_place_comparison,
                    inputs=[],
                    outputs=[comparison_plot, comparison_table],
                )

            with gr.Tab("👥 用户画像与行为分析"):
                gr.Markdown("### K-means聚类、PCA降维、用户类型识别(隐私保护)")
                user_profile_btn = gr.Button("🔍 生成用户画像分析", variant="primary", size="lg")

                user_profile_plot = gr.Plot(label="用户画像与聚类图表")
                user_profile_table = gr.DataFrame(label="用户类型统计")

                user_profile_btn.click(
                    fn=_with_optional_places(analyzer.user_profile_and_clustering),
                    inputs=[place_selector],
                    outputs=[user_profile_plot, user_profile_table],
                )

            with gr.Tab("🤖 AI满意度预测"):
                gr.Markdown("""
                ### 🎯 基于随机森林的智能满意度预测模型
                - **算法**: Random Forest Classifier
                - **特征**: 评论长度、时间特征、景点编码等7个维度
                - **输出**: 特征重要性、ROC曲线、混淆矩阵、预测 vs 实际对比
                """)
                ml_predict_btn = gr.Button("🚀 训练并预测满意度", variant="primary", size="lg")

                ml_predict_plot = gr.Plot(label="机器学习模型分析")
                ml_predict_table = gr.DataFrame(label="模型评估报告")

                ml_predict_btn.click(
                    fn=_with_optional_places(analyzer.ml_satisfaction_predictor),
                    inputs=[place_selector],
                    outputs=[ml_predict_plot, ml_predict_table],
                )

            with gr.Tab("🎯 AI智能推荐"):
                gr.Markdown("""
                ### 🗺️ 基于层次聚类的智能旅游路线推荐
                - **算法**: Hierarchical Clustering + 多维度评分
                - **维度**: 质量得分、热度得分、稳定性等
                - **输出**: Top 5推荐路线、质量-热度矩阵、景点分组
                """)
                recommend_btn = gr.Button("🎁 生成智能推荐路线", variant="primary", size="lg")

                recommend_plot = gr.Plot(label="智能推荐分析")
                recommend_table = gr.DataFrame(label="推荐排名详情")

                # The recommendation takes no inputs; the place selector is not consulted.
                recommend_btn.click(
                    fn=analyzer.intelligent_tour_recommendation,
                    inputs=[],
                    outputs=[recommend_plot, recommend_table],
                )

            with gr.Tab("📚 主题挖掘"):
                gr.Markdown("""
                ### 📖 基于LDA的评论主题建模分析
                - **算法**: Latent Dirichlet Allocation (LDA)
                - **预处理**: TF-IDF向量化 + 中文分词
                - **输出**: 主题关键词、主题分布、主题-评分关系
                """)
                n_topics = gr.Slider(3, 8, value=5, step=1, label="主题数量")
                topic_btn = gr.Button("📚 提取评论主题", variant="primary", size="lg")

                topic_plot = gr.Plot(label="主题建模分析")
                topic_table = gr.DataFrame(label="主题摘要")

                topic_btn.click(
                    fn=_with_optional_places(analyzer.topic_modeling_analysis),
                    inputs=[place_selector, n_topics],
                    outputs=[topic_plot, topic_table],
                )

            with gr.Tab("🚨 异常检测"):
                gr.Markdown("""
                ### 🔍 基于Isolation Forest的异常评论检测
                - **算法**: Isolation Forest(孤立森林)
                - **检测维度**: 评分异常、长度异常、综合异常
                - **输出**: 异常评论标注、异常类型分类、景点异常率对比
                """)
                anomaly_btn = gr.Button("🔍 检测异常评论", variant="primary", size="lg")

                anomaly_plot = gr.Plot(label="异常检测分析")
                anomaly_table = gr.DataFrame(label="异常检测报告")

                anomaly_btn.click(
                    fn=_with_optional_places(analyzer.anomaly_detection_analysis),
                    inputs=[place_selector],
                    outputs=[anomaly_plot, anomaly_table],
                )

        gr.Markdown(f"""
        ---
        <div style="text-align: center; color: #666; padding: 20px;">
        <p style="font-size: 16px; margin-bottom: 10px;">
        🎨 <b>技术栈</b>: Gradio + Pandas + Scikit-learn + Matplotlib + Seaborn + Jieba
        </p>
        <p style="font-size: 14px;">
        🤖 <b>ML算法</b>: Random Forest | K-means | LDA | Isolation Forest | Hierarchical Clustering
        </p>
        <p style="font-size: 12px; margin-top: 10px; opacity: 0.7;">
        💾 数据来源: {total_reviews} 条真实用户评论 | 🔒 隐私保护: 匿名化处理
        </p>
        </div>
        """)

    return demo
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: print the start-up banner, build the UI, then serve it.
    banner_rule = "=" * 70
    print("\n" + banner_rule)
    print("🚀 启动 AI旅游智能分析系统 v3.0 - 机器学习增强版")
    print(banner_rule + "\n")

    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }

    demo = create_interface()
    demo.launch(**launch_options)