# Commit "first commit": new file aicodes/module1_dashboard/cardio_dashboard.py (@@ -0,0 +1,494 @@)
# -*- coding: utf-8 -*-
"""
CardioAI - Module 1: Data Visualization Dashboard

Data cleaning, feature engineering and interactive visualization for
cardiovascular disease data.
"""

# Standard library
import os
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st

# ============================================
# Configuration and constants
# ============================================
# Root of the code tree on the original development machine.
CODE_ROOT = Path(r"E:\project_ai\claude_project1\aicodes")

# Location of the Excel workbook read by the dashboard. The default is the
# path used on the original development machine; set the CARDIO_DATA_PATH
# environment variable to point the dashboard at a workbook stored elsewhere.
# An unset or empty variable falls back to the default.
_DEFAULT_DATA_PATH = r"E:\project_ai\claude_project1\data\心血管疾病.xlsx"
DATA_PATH = Path(os.environ.get("CARDIO_DATA_PATH") or _DEFAULT_DATA_PATH)

# Lookup tables translating the numeric codes stored in the dataset into the
# display labels shown in the UI (the labels are runtime strings).
CHOLESTEROL_MAP = {
    1: "正常",
    2: "高于正常",
    3: "远高于正常",
}

GLUC_MAP = {
    1: "正常",
    2: "高于正常",
    3: "远高于正常",
}

GENDER_MAP = {
    1: "女性",
    2: "男性",
}

CARDIO_MAP = {
    0: "无疾病",
    1: "有疾病",
}

# ============================================
# Data loading and cleaning
# ============================================
@st.cache_data
def _read_workbook(path):
    """
    Read the Excel workbook at *path* into a DataFrame.

    The result is cached by Streamlit. Errors are deliberately NOT caught
    here: a call that raises is not cached, so the next rerun retries the
    read instead of replaying a stale failure.

    Args:
        path (str): Path of the workbook to read.

    Returns:
        pd.DataFrame: The workbook contents.
    """
    return pd.read_excel(path, engine='openpyxl')


def load_data():
    """
    Load the cardiovascular disease dataset from ``DATA_PATH``.

    Successful reads are cached (see ``_read_workbook``). On failure the
    error is reported in the UI and an empty DataFrame is returned; because
    the failure itself is never cached, fixing the file and rerunning the app
    is enough to recover.

    Returns:
        pd.DataFrame: The raw data, or an empty DataFrame when loading fails.
    """
    try:
        # Pass a plain string so the cache key is a simple hashable value.
        return _read_workbook(str(DATA_PATH))
    except Exception as e:  # UI boundary: report any load failure, don't crash
        st.error(f"数据加载失败: {str(e)}")
        return pd.DataFrame()

@st.cache_data
def clean_and_process_data(df):
    """
    Clean the raw data and derive the columns used by the dashboard.

    Steps:
        1. Drop rows with implausible blood pressure: keep only rows where
           ``ap_lo < ap_hi``, ``90 <= ap_hi <= 250`` and ``60 <= ap_lo <= 150``.
        2. Convert ``age`` (days) to whole years in ``age_years``.
        3. Compute ``bmi`` as ``weight / (height / 100) ** 2``.
        4. Map the coded ``cholesterol``, ``gluc``, ``gender`` and ``cardio``
           columns to label columns with a ``_desc`` suffix.
        5. Bucket ``bmi`` into ``bmi_category`` using the cut points
           18.5 / 24 / 28 (lower bound inclusive).

    Args:
        df (pd.DataFrame): Raw data.

    Returns:
        pd.DataFrame: A new, cleaned DataFrame; ``df`` itself is returned
        unchanged when it is empty.
    """
    if df.empty:
        return df

    # Filter first, then take one explicit copy: the derived columns are
    # written to an independent frame (no SettingWithCopyWarning) and are
    # only computed for the rows that survive the filter.
    ap_hi = df['ap_hi']
    ap_lo = df['ap_lo']
    plausible = (
        (ap_lo < ap_hi)              # diastolic must be below systolic
        & ap_hi.between(90, 250)     # inclusive bounds
        & ap_lo.between(60, 150)     # inclusive bounds
    )
    df_clean = df.loc[plausible].copy()

    # Age: days -> years, rounded to the nearest integer.
    df_clean['age_years'] = (df_clean['age'] / 365.25).round().astype(int)

    # BMI: weight / (height / 100) ** 2
    df_clean['bmi'] = df_clean['weight'] / ((df_clean['height'] / 100) ** 2)

    # Coded columns -> label columns.
    df_clean['cholesterol_desc'] = df_clean['cholesterol'].map(CHOLESTEROL_MAP)
    df_clean['gluc_desc'] = df_clean['gluc'].map(GLUC_MAP)
    df_clean['gender_desc'] = df_clean['gender'].map(GENDER_MAP)
    df_clean['cardio_desc'] = df_clean['cardio'].map(CARDIO_MAP)

    # BMI buckets: [-inf, 18.5), [18.5, 24), [24, 28), [28, inf).
    # Unlike a chain of `<` comparisons, pd.cut leaves a missing or infinite
    # BMI unlabelled (NaN) instead of letting it fall through to the last
    # bucket. Cast back to object so downstream groupby calls see plain
    # strings rather than a categorical dtype.
    df_clean['bmi_category'] = pd.cut(
        df_clean['bmi'],
        bins=[float("-inf"), 18.5, 24, 28, float("inf")],
        labels=["体重过轻", "正常体重", "超重", "肥胖"],
        right=False,
    ).astype(object)

    return df_clean

# ============================================
# Filtering
# ============================================
def apply_filters(df, age_range, gender_filter, cardio_filter):
    """
    Filter the processed data by the user's sidebar selections.

    Args:
        df (pd.DataFrame): Processed data containing ``age_years``,
            ``gender_desc`` and ``cardio_desc``.
        age_range (tuple): Inclusive age range ``(min, max)``.
        gender_filter (list): Gender labels to keep; an empty selection
            applies no gender filter.
        cardio_filter (list): Disease-status labels to keep; an empty
            selection applies no status filter.

    Returns:
        pd.DataFrame: A new DataFrame with the matching rows; ``df`` itself
        is returned unchanged when it is empty.
    """
    if df.empty:
        return df

    # Combine all conditions into one boolean mask so the frame is sliced
    # (and copied) exactly once.
    mask = df['age_years'].between(age_range[0], age_range[1])

    if gender_filter:
        mask &= df['gender_desc'].isin(gender_filter)

    if cardio_filter:
        mask &= df['cardio_desc'].isin(cardio_filter)

    return df.loc[mask].copy()

# ============================================
# Summary statistics
# ============================================
def calculate_statistics(df):
    """
    Compute the headline statistics for the filtered data.

    Args:
        df (pd.DataFrame): Filtered data containing the ``cardio`` column.

    Returns:
        dict: ``{"total_records": int, "risk_rate": float}`` where
        ``risk_rate`` is the sum of ``cardio`` divided by the number of rows,
        expressed as a percentage. Both values are zero for empty input.
    """
    if df.empty:
        return {"total_records": 0, "risk_rate": 0.0}

    # df is non-empty here, so the division below cannot hit zero.
    total_records = len(df)
    disease_count = df['cardio'].sum()

    return {
        "total_records": total_records,
        # Plain Python float, matching the type returned for empty input.
        "risk_rate": float(disease_count) / total_records * 100,
    }

# ============================================
# Chart builders
# ============================================
def plot_age_distribution(df):
    """
    Build an age histogram with one overlaid trace per disease status.

    Args:
        df (pd.DataFrame): Data containing ``age_years`` and ``cardio_desc``.

    Returns:
        plotly.graph_objects.Figure: The histogram, or ``None`` when ``df``
        is empty.
    """
    if df.empty:
        return None

    fig = px.histogram(
        df,
        x='age_years',
        color='cardio_desc',
        nbins=30,
        title='年龄分布 (按心血管疾病状态)',
        labels={'age_years': '年龄 (岁)', 'count': '人数'},
        color_discrete_map={'无疾病': '#2ecc71', '有疾病': '#e74c3c'},
        barmode='overlay',
    )

    fig.update_layout(
        xaxis_title="年龄 (岁)",
        yaxis_title="人数",
        legend_title="疾病状态",
        hovermode='x unified',
    )

    return fig

def plot_bmi_vs_cardio(df):
    """
    Build a stacked bar chart of record counts per BMI category and
    disease status.

    Args:
        df (pd.DataFrame): Data containing ``bmi_category`` and
            ``cardio_desc``.

    Returns:
        plotly.graph_objects.Figure: The bar chart, or ``None`` when ``df``
        is empty.
    """
    if df.empty:
        return None

    # Number of records for every (BMI category, disease status) pair.
    bmi_cardio = (
        df.groupby(['bmi_category', 'cardio_desc'])
        .size()
        .reset_index(name='count')
    )

    bmi_order = ["体重过轻", "正常体重", "超重", "肥胖"]

    fig = px.bar(
        bmi_cardio,
        x='bmi_category',
        y='count',
        color='cardio_desc',
        title='BMI分类与心血管疾病关系',
        labels={'bmi_category': 'BMI分类', 'count': '人数'},
        color_discrete_map={'无疾病': '#2ecc71', '有疾病': '#e74c3c'},
        # category_orders fixes the x-axis order on its own, so no separate
        # Categorical conversion or sort is needed. Pinning the status order
        # as well keeps the legend and stacking order independent of which
        # rows happen to come first in the data.
        category_orders={
            'bmi_category': bmi_order,
            'cardio_desc': ["无疾病", "有疾病"],
        },
    )

    fig.update_layout(
        xaxis_title="BMI分类",
        yaxis_title="人数",
        legend_title="疾病状态",
        barmode='stack',
    )

    return fig

# ============================================
# Main application
# ============================================
def _render_sidebar(raw_data, processed_data):
    """
    Draw the sidebar filter widgets and the data overview.

    Args:
        raw_data (pd.DataFrame): Data as loaded, used for the record counts.
        processed_data (pd.DataFrame): Cleaned data that supplies the widget
            ranges and options.

    Returns:
        tuple: ``(age_range, gender_filter, cardio_filter)`` as returned by
        the slider and the two multiselect widgets.
    """
    st.sidebar.header("🔍 数据筛选")

    # Age slider spanning the full range present in the cleaned data.
    age_min = int(processed_data['age_years'].min())
    age_max = int(processed_data['age_years'].max())
    age_range = st.sidebar.slider(
        "年龄范围 (岁)",
        min_value=age_min,
        max_value=age_max,
        value=(age_min, age_max),
        step=1,
    )

    # Gender multiselect; every option is selected by default.
    gender_options = processed_data['gender_desc'].unique().tolist()
    gender_filter = st.sidebar.multiselect(
        "性别",
        options=gender_options,
        default=gender_options,
    )

    # Disease-status multiselect; every option is selected by default.
    cardio_options = processed_data['cardio_desc'].unique().tolist()
    cardio_filter = st.sidebar.multiselect(
        "心血管疾病状态",
        options=cardio_options,
        default=cardio_options,
    )

    st.sidebar.markdown("---")
    st.sidebar.markdown("### 📊 数据概览")
    st.sidebar.markdown(f"- 原始记录数: **{len(raw_data):,}**")
    st.sidebar.markdown(f"- 清洗后记录数: **{len(processed_data):,}**")
    # Share of raw records removed by cleaning.
    st.sidebar.markdown(f"- 数据清洗率: **{(1 - len(processed_data)/len(raw_data))*100:.2f}%**")

    return age_range, gender_filter, cardio_filter


def _render_summary(stats):
    """Show the record count and risk rate from *stats* as two metrics."""
    col1, col2 = st.columns(2)

    with col1:
        st.metric(
            label="筛选后总记录数",
            value=f"{stats['total_records']:,}",
        )

    with col2:
        # No delta is shown, so no delta_color is passed: that argument
        # only styles the delta indicator.
        st.metric(
            label="心血管疾病风险率",
            value=f"{stats['risk_rate']:.2f}%",
        )


def _plot_level_vs_cardio(df, column, title, axis_label):
    """
    Build a bar chart of record counts per level of *column* and disease
    status, with the levels in a fixed order.

    Args:
        df (pd.DataFrame): Data containing *column* and ``cardio_desc``.
        column (str): Name of the label column to put on the x-axis.
        title (str): Chart title.
        axis_label (str): Display label for *column*.

    Returns:
        plotly.graph_objects.Figure: The bar chart.
    """
    counts = df.groupby([column, 'cardio_desc']).size().reset_index(name='count')
    return px.bar(
        counts,
        x=column,
        y='count',
        color='cardio_desc',
        title=title,
        labels={column: axis_label, 'count': '人数'},
        # Same explicit ordering for every level chart, so the x-axis and
        # legend do not depend on which rows the current filter leaves.
        category_orders={
            column: ["正常", "高于正常", "远高于正常"],
            'cardio_desc': ["无疾病", "有疾病"],
        },
        color_discrete_map={'无疾病': '#2ecc71', '有疾病': '#e74c3c'},
    )


def _render_charts(filtered_data):
    """Draw all charts for the non-empty *filtered_data*."""
    # Row 1: age distribution and gender split.
    col1, col2 = st.columns(2)

    with col1:
        age_fig = plot_age_distribution(filtered_data)
        if age_fig is not None:
            st.plotly_chart(age_fig, use_container_width=True)

    with col2:
        gender_dist = filtered_data['gender_desc'].value_counts().reset_index()
        gender_dist.columns = ['性别', '人数']
        gender_pie = px.pie(
            gender_dist,
            values='人数',
            names='性别',
            title='性别分布',
            color_discrete_sequence=['#3498db', '#e91e63'],
        )
        st.plotly_chart(gender_pie, use_container_width=True)

    # Row 2: BMI category versus disease status.
    st.markdown("### BMI分类与心血管疾病风险")
    bmi_fig = plot_bmi_vs_cardio(filtered_data)
    if bmi_fig is not None:
        st.plotly_chart(bmi_fig, use_container_width=True)

    # Row 3: cholesterol and glucose levels versus disease status.
    col1, col2 = st.columns(2)

    with col1:
        chol_fig = _plot_level_vs_cardio(
            filtered_data, 'cholesterol_desc', '胆固醇水平与心血管疾病', '胆固醇水平'
        )
        st.plotly_chart(chol_fig, use_container_width=True)

    with col2:
        gluc_fig = _plot_level_vs_cardio(
            filtered_data, 'gluc_desc', '血糖水平与心血管疾病', '血糖水平'
        )
        st.plotly_chart(gluc_fig, use_container_width=True)


def _render_table(filtered_data):
    """Show the first 100 filtered rows with display-friendly column names."""
    display_columns = [
        'age_years', 'gender_desc', 'height', 'weight', 'bmi', 'bmi_category',
        'ap_hi', 'ap_lo', 'cholesterol_desc', 'gluc_desc',
        'smoke', 'alco', 'active', 'cardio_desc',
    ]

    column_rename = {
        'age_years': '年龄(岁)',
        'gender_desc': '性别',
        'height': '身高(cm)',
        'weight': '体重(kg)',
        'bmi': 'BMI',
        'bmi_category': 'BMI分类',
        'ap_hi': '收缩压',
        'ap_lo': '舒张压',
        'cholesterol_desc': '胆固醇',
        'gluc_desc': '血糖',
        'smoke': '吸烟',
        'alco': '饮酒',
        'active': '运动',
        'cardio_desc': '心血管疾病',
    }

    if not filtered_data.empty:
        # Take the preview rows first so only 100 rows are copied and renamed.
        display_df = filtered_data[display_columns].head(100).rename(columns=column_rename)
        st.dataframe(display_df, use_container_width=True, height=400)


def main():
    """
    Run the Streamlit dashboard.

    Loads and cleans the data, draws the sidebar filters, then renders the
    summary metrics, the charts and a preview table for the filtered data.
    Stops early with a message when the data cannot be loaded, when cleaning
    leaves no rows, or when the current filters match no records.
    """
    # Page configuration.
    st.set_page_config(
        page_title="CardioAI - 心血管疾病数据分析仪表板",
        page_icon="❤️",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    # Title and description.
    st.title("❤️ CardioAI - 心血管疾病智能辅助系统")
    st.markdown("### Module 1: 数据可视化仪表板")
    st.markdown("---")

    # Load the data.
    with st.spinner("正在加载数据..."):
        raw_data = load_data()

    if raw_data.empty:
        st.error("无法加载数据,请检查数据路径是否正确。")
        st.stop()

    # Clean the data and derive features.
    processed_data = clean_and_process_data(raw_data)

    if processed_data.empty:
        st.warning("数据清洗后无有效记录,请检查数据质量。")
        st.stop()

    # Sidebar filters, then apply them.
    age_range, gender_filter, cardio_filter = _render_sidebar(raw_data, processed_data)
    filtered_data = apply_filters(
        processed_data,
        age_range,
        gender_filter,
        cardio_filter,
    )

    # Summary metrics.
    st.subheader("📈 筛选结果统计")
    _render_summary(calculate_statistics(filtered_data))
    st.markdown("---")

    # Charts.
    st.subheader("📊 数据可视化")

    if filtered_data.empty:
        # Nothing to plot: explain why instead of drawing empty charts.
        st.info("当前筛选条件下没有匹配的记录,请调整筛选条件。")
        return

    _render_charts(filtered_data)

    # Data preview table.
    st.markdown("---")
    st.subheader("📋 数据预览")
    _render_table(filtered_data)

# ============================================
# Script entry point
# ============================================
if __name__ == "__main__":
    main()
# Reference in New Issue / Block a user  (repository web-UI footer, not part of the program)