148 lines
3.8 KiB
Python
148 lines
3.8 KiB
Python
import streamlit as st
|
||
import pandas as pd
|
||
import numpy as np
|
||
import plotly.express as px
|
||
|
||
# 数据路径
|
||
# Location of the Excel workbook that load_and_preprocess_data() passes to
# pd.read_excel.
# NOTE(review): this is an absolute Windows path, so it presumably only
# resolves on the original author's machine — confirm before deploying.
DATA_PATH = "D:\\AI_Coding\\data\\心血管疾病.xlsx"
|
||
|
||
# 缓存数据加载和清洗函数
|
||
@st.cache_data
|
||
def load_and_preprocess_data(data_path=None):
    """Load the cardiovascular dataset and return it cleaned and enriched.

    Args:
        data_path: Optional path of the Excel workbook to read. When omitted,
            the module-level ``DATA_PATH`` is used, so existing zero-argument
            callers behave as before.

    Returns:
        A new ``pandas.DataFrame`` containing only the rows that pass the
        blood-pressure sanity checks, with these columns appended:
        ``age_years``, ``bmi``, ``cholesterol_desc``, ``gluc_desc`` and
        ``bmi_category``.

    Raises:
        KeyError: If the workbook lacks a column the preprocessing needs.
    """
    # Resolve the default at call time rather than in the signature so the
    # function does not capture DATA_PATH when it is defined.
    source = DATA_PATH if data_path is None else data_path
    return _preprocess_cardio_frame(pd.read_excel(source))


def _preprocess_cardio_frame(raw):
    """Filter implausible blood-pressure rows and derive the display columns."""
    required = ('age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc')
    missing = [column for column in required if column not in raw.columns]
    if missing:
        raise KeyError(f"missing required column(s): {missing}")

    # Outlier handling: keep rows whose diastolic pressure is below the
    # systolic pressure and whose readings fall in 90-250 / 60-150.
    # Series.between is inclusive on both ends, matching >= / <=.
    plausible = (
        (raw['ap_lo'] < raw['ap_hi'])
        & raw['ap_hi'].between(90, 250)
        & raw['ap_lo'].between(60, 150)
    )
    # Copy the selection so the column assignments below write to an
    # independent frame instead of a slice of `raw` (avoids pandas'
    # SettingWithCopyWarning and leaves the input untouched).
    df = raw.loc[plausible].copy()

    # Feature engineering: age is stored in days; convert to rounded years.
    df['age_years'] = (df['age'] / 365.25).round(0)

    # BMI = weight / height^2, with height converted from cm to m.
    df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

    # Category conversion: numeric levels -> descriptive labels.
    level_labels = {1: '正常', 2: '偏高', 3: '很高'}
    df['cholesterol_desc'] = df['cholesterol'].map(level_labels)
    df['gluc_desc'] = df['gluc'].map(level_labels)

    # BMI buckets are left-closed: [..,18.5) [18.5,24) [24,28) [28,..).
    # Missing or infinite BMI (e.g. a height of 0) stays NaN instead of being
    # silently counted as '肥胖', which is what an if/elif fall-through does.
    bmi_edges = [-np.inf, 18.5, 24, 28, np.inf]
    bmi_labels = ['偏瘦', '正常', '超重', '肥胖']
    df['bmi_category'] = pd.cut(
        df['bmi'], bins=bmi_edges, labels=bmi_labels, right=False
    ).astype(object)  # plain string labels, not a categorical dtype

    return df
# Load the preprocessed dataset.
df = load_and_preprocess_data()

# Streamlit app
st.title('CardioAI - 心血管疾病数据可视化')

# Without at least one usable age the slider bounds below would be NaN and
# int(NaN) raises ValueError, so report the problem and end this script run.
if df['age_years'].dropna().empty:
    st.error('没有可用的数据,请检查数据文件。')
    st.stop()

# Sidebar filters
st.sidebar.header('数据筛选')

# Range slider over age_years
age_min = int(df['age_years'].min())
age_max = int(df['age_years'].max())
if age_min < age_max:
    age_range = st.sidebar.slider(
        '年龄范围',
        min_value=age_min,
        max_value=age_max,
        value=(age_min, age_max),
    )
else:
    # Only one distinct age: there is no range to pick.
    # NOTE(review): st.slider has rejected min_value == max_value in at least
    # some Streamlit releases — confirm against the deployed version.
    age_range = (age_min, age_max)

# Multiselect for gender (all values selected by default)
gender_options = df['gender'].unique()
gender_filter = st.sidebar.multiselect(
    '性别 (1=女, 2=男)',
    options=gender_options,
    default=gender_options,
)

# Multiselect for cardio (all values selected by default)
cardio_options = df['cardio'].unique()
cardio_filter = st.sidebar.multiselect(
    '心血管疾病 (0=无, 1=有)',
    options=cardio_options,
    default=cardio_options,
)

# Apply the sidebar selections: keep rows whose age lies inside the chosen
# range (both ends inclusive) and whose gender / cardio values are selected.
age_low, age_high = age_range[0], age_range[1]
row_mask = (
    df['age_years'].between(age_low, age_high)
    & df['gender'].isin(gender_filter)
    & df['cardio'].isin(cardio_filter)
)
filtered_df = df.loc[row_mask]

# Main page: overview metrics
st.header('数据概览')

# Number of rows left after filtering
record_count = len(filtered_df)
st.metric('筛选后的总记录数', record_count)

# Share of rows with cardio set, as a percentage; skipped when nothing is
# selected so the division never sees a zero denominator.
if record_count > 0:
    positive_total = filtered_df['cardio'].sum()
    cardio_rate = (positive_total / record_count) * 100
    st.metric('心血管疾病总风险率', f"{cardio_rate:.2f}%")

# Charts
st.header('数据可视化')

# Histogram of age_years, one overlaid series per cardio value
st.subheader('年龄分布与心血管疾病关系')
age_labels = {'age_years': '年龄', 'cardio': '心血管疾病', 'count': '人数'}
age_layout = {
    'xaxis_title': '年龄',
    'yaxis_title': '人数',
    'legend_title': '心血管疾病',
}
fig_age = px.histogram(
    filtered_df,
    x='age_years',
    color='cardio',
    nbins=20,
    barmode='overlay',
    title='年龄分布 histogram',
    labels=age_labels,
)
fig_age.update_layout(**age_layout)
st.plotly_chart(fig_age)

# Stacked bar chart of bmi_category, split by cardio
st.subheader('BMI类别与心血管疾病关系')
fig_bmi = px.histogram(
    filtered_df,
    x='bmi_category',
    color='cardio',
    barmode='stack',
    labels={'bmi_category': 'BMI类别', 'cardio': '心血管疾病', 'count': '人数'},
    title='BMI类别对心血管疾病影响的堆叠柱状图',
    # The BMI buckets are ordinal; without an explicit order the bars follow
    # whatever order the labels first appear in the filtered rows.
    category_orders={'bmi_category': ['偏瘦', '正常', '超重', '肥胖']},
)
fig_bmi.update_layout(
    xaxis_title='BMI类别',
    yaxis_title='人数',
    legend_title='心血管疾病',
)
st.plotly_chart(fig_bmi)

# Show the first rows of the filtered data as a sample
st.header('数据样本')
preview_rows = filtered_df.head()
st.dataframe(preview_rows)