- Module 1: Dashboard for cardiovascular disease data visualization - Module 2: Machine learning predictor with Flask API - Module 3: Voice assistant with DeepSeek and CosyVoice integration - Add .gitignore for proper file exclusion - Update requirements and documentation Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
354 lines
12 KiB
Python
354 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
CardioAI - Cardiovascular Disease Dashboard
|
|
Streamlit application for data cleaning, feature engineering, and interactive visualization.
|
|
"""
|
|
|
|
import os
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from plotly.subplots import make_subplots
|
|
# Silence every Python warning for the lifetime of the app process.
warnings.filterwarnings('ignore')

# Page-level Streamlit settings, gathered in one mapping and applied in a
# single call.
_PAGE_SETTINGS = {
    "page_title": "CardioAI - Cardiovascular Disease Dashboard",
    "page_icon": "❤️",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
|
|
|
|
# Constants
# Location of the source Excel workbook. Set the CARDIOAI_DATA_PATH
# environment variable to point the dashboard at a different file; when it is
# unset (or empty) the original development path is used, so existing setups
# keep working unchanged.
DATA_PATH = (
    os.environ.get("CARDIOAI_DATA_PATH")
    or "/Users/anthony/ai_lesson/ai_zhangzhongshan/心血管疾病.xlsx"
)
|
|
|
|
def _build_rename_map(columns, column_mapping):
    """Map raw column labels to standardized names without creating duplicates.

    A raw label is matched when a mapping key occurs inside it
    (case-insensitively); the first matching key decides the target name.
    A rename is skipped when the target name is already claimed by an earlier
    column or already exists as a different column, because two columns with
    the same name would make ``df[name]`` return a DataFrame instead of a
    Series.
    """
    existing = set(columns)
    claimed = set()
    rename_map = {}
    for col in columns:
        lowered = str(col).lower()  # str() tolerates non-string labels
        for key, target in column_mapping.items():
            if key.lower() not in lowered:
                continue
            clashes = target in claimed or (target in existing and target != col)
            if not clashes:
                rename_map[col] = target
                claimed.add(target)
            break
    return rename_map


@st.cache_data
def load_and_clean_data():
    """
    Load the cardiovascular workbook at ``DATA_PATH`` and clean it.

    Steps: standardize column names, derive ``age_years`` and ``bmi``, drop
    rows with implausible blood pressure, and add readable label columns
    (``cholesterol_str``, ``gluc_str``, ``bmi_category``).

    Returns:
        The cleaned DataFrame, or an empty DataFrame when the file cannot be
        read (an error message is shown in the app in that case).
    """
    try:
        df = pd.read_excel(DATA_PATH, engine='openpyxl')
    except Exception as e:
        # NOTE(review): this function is wrapped in st.cache_data, so the empty
        # frame returned here is presumably cached as well; a later fix to the
        # data file may not be picked up until the cache is cleared - confirm.
        st.error(f"数据加载失败: {e}")
        return pd.DataFrame()
    else:
        st.sidebar.success(f"数据加载成功!原始记录数: {len(df):,}")

    # Known naming variations -> standardized column names.
    column_mapping = {
        'age(天)': 'age',
        'cholestero1': 'cholesterol',
        'cholesterol': 'cholesterol',
        'gluc': 'gluc',
        'ap_hi': 'ap_hi',
        'ap_lo': 'ap_lo',
        'cardio': 'cardio',
        'gender': 'gender',
        'height': 'height',
        'weight': 'weight',
        'smoke': 'smoke',
        'alco': 'alco',
        'active': 'active',
    }
    df = df.rename(columns=_build_rename_map(df.columns.tolist(), column_mapping))

    # 1. Age in days -> whole years.
    if 'age' in df.columns:
        age_years = (df['age'] / 365.25).round()
        # Casting NaN to int raises, so only cast when every age is present;
        # otherwise keep the rounded float values.
        df['age_years'] = age_years.astype(int) if age_years.notna().all() else age_years

    # 2. BMI = weight (kg) / height (m)^2.
    if 'height' in df.columns and 'weight' in df.columns:
        height_m = df['height'] / 100
        # A non-positive height would yield an infinite or meaningless BMI;
        # masking it makes the BMI missing instead.
        df['bmi'] = (df['weight'] / (height_m.where(height_m > 0) ** 2)).round(2)

    # 3. Keep only rows with plausible blood pressure: diastolic below
    #    systolic, systolic within 90-250 and diastolic within 60-150.
    if 'ap_hi' in df.columns and 'ap_lo' in df.columns:
        plausible = (
            (df['ap_lo'] < df['ap_hi'])
            & df['ap_hi'].between(90, 250)
            & df['ap_lo'].between(60, 150)
        )
        # copy() so the column assignments below act on an independent frame
        # rather than on a slice of the raw data.
        df = df[plausible].copy()

    # 4. Cholesterol and glucose codes -> descriptive strings.
    level_labels = {
        1: "Normal",
        2: "Above Normal",
        3: "Well Above Normal",
    }
    if 'cholesterol' in df.columns:
        df['cholesterol_str'] = df['cholesterol'].map(level_labels)
    if 'gluc' in df.columns:
        df['gluc_str'] = df['gluc'].map(level_labels)

    # 5. BMI categories. Left-closed bins reproduce the thresholds
    #    <18.5, [18.5, 25), [25, 30), >=30. A missing BMI stays missing
    #    instead of being labelled "Obese".
    if 'bmi' in df.columns:
        df['bmi_category'] = pd.cut(
            df['bmi'],
            bins=[-np.inf, 18.5, 25, 30, np.inf],
            labels=["Underweight", "Normal", "Overweight", "Obese"],
            right=False,
        ).astype(object)

    return df
|
|
|
|
def _multiselect_all_selected(df, column, label, **widget_kwargs):
    """Render a sidebar multiselect over a column's sorted unique values.

    All values are selected by default. Returns the current selection, or
    None (without rendering anything) when the column is absent.
    """
    if column not in df.columns:
        return None
    choices = sorted(df[column].unique())
    return st.sidebar.multiselect(
        label,
        options=choices,
        default=choices,
        **widget_kwargs,
    )


def create_filters(df):
    """Render the sidebar filter widgets and return the chosen values.

    Returns a tuple ``(age_range, gender_filter, cardio_filter)``:
    ``age_range`` is a (min, max) pair, falling back to (0, 100) when there is
    no ``age_years`` column; each of the other two is the list of selected
    values, or None when the corresponding column is missing.
    """
    st.sidebar.header("📊 数据筛选器")

    # Age range slider, spanning the ages present in the data.
    age_range = (0, 100)
    if 'age_years' in df.columns:
        youngest = int(df['age_years'].min())
        oldest = int(df['age_years'].max())
        age_range = st.sidebar.slider(
            "选择年龄范围 (岁)",
            min_value=youngest,
            max_value=oldest,
            value=(youngest, oldest),
        )

    # Gender filter, then cardiovascular-disease status filter.
    gender_filter = _multiselect_all_selected(df, 'gender', "选择性别")
    cardio_filter = _multiselect_all_selected(
        df,
        'cardio',
        "心血管疾病状态",
        help="0: 无疾病, 1: 有疾病",
    )

    return age_range, gender_filter, cardio_filter
|
|
|
|
def apply_filters(df, age_range, gender_filter, cardio_filter):
    """Return the rows of *df* that satisfy every active filter.

    ``age_range`` is an inclusive (low, high) pair applied to ``age_years``.
    ``gender_filter`` and ``cardio_filter`` are collections of allowed values,
    or None to leave that filter off. A filter whose column is missing from
    *df* is ignored. The input frame is not modified.
    """
    keep = pd.Series(True, index=df.index)

    if 'age_years' in df.columns:
        low, high = age_range[0], age_range[1]
        keep = keep & df['age_years'].between(low, high)

    membership_filters = (
        ('gender', gender_filter),
        ('cardio', cardio_filter),
    )
    for column, allowed in membership_filters:
        if allowed is None or column not in df.columns:
            continue
        keep = keep & df[column].isin(allowed)

    return df[keep]
|
|
|
|
def _mean_or_none(df, column):
    """Return the mean of *column*, or None when it cannot be computed.

    None is returned when the column is absent or when the mean is NaN
    (for example when the frame has no rows).
    """
    if column not in df.columns:
        return None
    average = df[column].mean()
    return None if pd.isna(average) else average


def display_summary_metrics(df):
    """Show four headline metrics in a row, followed by a divider.

    The metrics are: record count, cardiovascular-disease rate, mean age and
    mean BMI. A metric shows "N/A" when its column is missing or when there
    is nothing to average (e.g. the filters removed every row), rather than
    rendering "nan".
    """
    risk_rate = _mean_or_none(df, 'cardio')
    avg_age = _mean_or_none(df, 'age_years')
    avg_bmi = _mean_or_none(df, 'bmi')

    cards = [
        ("📈 总记录数", f"{len(df):,}"),
        # cardio is a 0/1 flag, so its mean times 100 is the share in percent.
        ("❤️ 心血管疾病风险率", "N/A" if risk_rate is None else f"{risk_rate * 100:.2f}%"),
        ("👥 平均年龄", "N/A" if avg_age is None else f"{avg_age:.1f} 岁"),
        ("⚖️ 平均BMI", "N/A" if avg_bmi is None else f"{avg_bmi:.1f}"),
    ]

    for slot, (label, value) in zip(st.columns(4), cards):
        with slot:
            st.metric(label, value)

    st.markdown("---")
|
|
|
|
# One colour per disease status, shared by both charts so that a status is
# drawn in the same colour everywhere, whatever the row order of the data.
_CARDIO_COLORS = {'无疾病': '#4ECDC4', '有疾病': '#FF6B6B'}

# Display order for BMI categories (lightest to heaviest).
_BMI_ORDER = ["Underweight", "Normal", "Overweight", "Obese"]


def create_visualizations(df):
    """Render the charts and the data-detail sections for *df*.

    Draws an age histogram split by disease status, a stacked bar chart of
    disease share per BMI category, a preview of the first 100 rows and
    summary statistics. Each chart is replaced by an info message when the
    columns it needs are missing. The DataFrame passed in is not modified.
    """
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("📊 年龄分布 (按心血管疾病状态)")
        if 'age_years' in df.columns and 'cardio' in df.columns:
            # Readable legend labels. assign() builds a new frame, so the
            # caller's DataFrame is left untouched; rebinding the local name
            # keeps the label column visible in the detail tables below.
            df = df.assign(
                cardio_str=df['cardio'].eq(1).map({True: '有疾病', False: '无疾病'})
            )

            fig1 = px.histogram(
                df,
                x='age_years',
                color='cardio_str',
                nbins=30,
                barmode='overlay',
                opacity=0.7,
                # Explicit label -> colour mapping instead of a positional
                # colour sequence, whose assignment depends on which label
                # happens to appear first in the data.
                color_discrete_map=_CARDIO_COLORS,
                category_orders={'cardio_str': ['无疾病', '有疾病']},
                labels={'age_years': '年龄 (岁)', 'cardio_str': '心血管疾病状态'},
            )
            fig1.update_layout(
                legend_title="疾病状态",
                xaxis_title="年龄 (岁)",
                yaxis_title="人数",
                bargap=0.1,
            )
            st.plotly_chart(fig1, use_container_width=True)
        else:
            st.info("年龄或心血管疾病状态数据不可用")

    with col2:
        st.subheader("📈 BMI类别与心血管疾病关系")
        if 'bmi_category' in df.columns and 'cardio' in df.columns:
            # Row-normalized cross-tabulation: per BMI category, the
            # percentage of records in each disease status.
            cross_tab = pd.crosstab(df['bmi_category'], df['cardio'], normalize='index') * 100

            # Show categories lightest-to-heaviest instead of alphabetically;
            # any unexpected label is kept and placed after the known ones.
            ordered = [c for c in _BMI_ORDER if c in cross_tab.index]
            ordered += [c for c in cross_tab.index if c not in _BMI_ORDER]
            cross_tab = cross_tab.reindex(ordered)

            categories = cross_tab.index.tolist()
            # A status absent from the data contributes a zero-height bar.
            no_disease = cross_tab[0].values if 0 in cross_tab.columns else [0] * len(categories)
            has_disease = cross_tab[1].values if 1 in cross_tab.columns else [0] * len(categories)

            fig2 = go.Figure(data=[
                go.Bar(name='无疾病', x=categories, y=no_disease,
                       marker_color=_CARDIO_COLORS['无疾病']),
                go.Bar(name='有疾病', x=categories, y=has_disease,
                       marker_color=_CARDIO_COLORS['有疾病']),
            ])
            fig2.update_layout(
                barmode='stack',
                xaxis_title="BMI 类别",
                yaxis_title="百分比 (%)",
                legend_title="疾病状态",
                yaxis=dict(range=[0, 100]),
            )
            st.plotly_chart(fig2, use_container_width=True)
        else:
            st.info("BMI类别或心血管疾病状态数据不可用")

    # Data details
    st.markdown("---")
    st.subheader("🔍 数据详情")

    # Preview of the filtered rows.
    with st.expander("查看筛选后的数据 (前100行)"):
        st.dataframe(df.head(100), use_container_width=True)

    # Summary statistics.
    with st.expander("查看数据统计摘要"):
        if not df.empty:
            st.write("**数值型变量统计:**")
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            # describe() raises on a frame without columns, so skip the table
            # when there is nothing numeric to summarize.
            if len(numeric_cols) > 0:
                st.dataframe(df[numeric_cols].describe(), use_container_width=True)

            st.write("**分类变量统计:**")
            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
            for col in categorical_cols:
                st.write(f"**{col}**:")
                st.dataframe(df[col].value_counts(), use_container_width=True)
|
|
|
|
def _render_dataset_info(df):
    """Show record count, feature count, column names and missing values."""
    with st.expander("📋 数据集信息", expanded=False):
        st.write(f"**总记录数:** {len(df):,}")
        st.write(f"**特征数:** {len(df.columns)}")
        st.write("**列名:**", ", ".join(df.columns.tolist()))

        # Per-column count of missing values; only non-zero counts are listed.
        missing_values = df.isnull().sum()
        if not missing_values.any():
            st.write("**缺失值:** 无")
        else:
            st.write("**缺失值:**")
            st.dataframe(missing_values[missing_values > 0].rename('缺失数量'))


def _render_footer():
    """Draw the divider and the centered grey footer."""
    st.markdown("---")
    st.markdown(
        """
        <div style='text-align: center; color: gray;'>
            <p>CardioAI - 心血管疾病智能辅助系统 | 模块1: 数据可视化仪表板</p>
            <p>使用 Streamlit 和 Plotly 构建</p>
        </div>
        """,
        unsafe_allow_html=True
    )


def main():
    """Run the dashboard page from top to bottom.

    Renders the title, loads the data, and stops with an error message when
    nothing could be loaded. Otherwise it shows the dataset information, the
    sidebar filters, the summary metrics, the visualizations and the footer.
    """
    # Title and description
    st.title("❤️ CardioAI - 心血管疾病智能辅助系统")
    st.markdown("""
    ### 模块1: 数据可视化仪表板
    本模块提供心血管疾病数据的交互式探索、清洗和可视化功能。
    """)

    # Load and clean data
    with st.spinner("正在加载和清洗数据..."):
        df = load_and_clean_data()

    # Nothing to show without data.
    if df.empty:
        st.error("无法加载数据。请检查数据文件路径和格式。")
        return

    _render_dataset_info(df)

    # Sidebar filters -> filtered view of the data.
    age_range, gender_filter, cardio_filter = create_filters(df)
    filtered_df = apply_filters(df, age_range, gender_filter, cardio_filter)

    # Metrics and charts both work on the filtered rows.
    display_summary_metrics(filtered_df)
    create_visualizations(filtered_df)

    _render_footer()
|
|
|
|
# Call main() only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()