Files
cs188/bayesian/generate_data.py
2026-01-01 17:27:18 +08:00

57 lines
1.8 KiB
Python

import pandas as pd
import numpy as np
# Ground-truth conditional probability tables (CPTs).
# Students are expected to "reverse-learn" these probabilities from the
# generated data.
#
# Root nodes map straight to a probability. Every other node maps a tuple of
# parent values (in the order named in its comment) to a probability; the
# sampler uses each value as the success probability of a single Bernoulli
# draw for that node.
PROBS = {
    'IsSmoker': 0.2,
    'VisitAsia': 0.05,
    # Conditioned on (IsSmoker,)
    'HasLungCancer': {(1,): 0.1, (0,): 0.01},
    # Conditioned on (VisitAsia,)
    'HasTuberculosis': {(1,): 0.08, (0,): 0.001},
    # Conditioned on (HasLungCancer, HasTuberculosis)
    'XRayPositive': {
        (1, 1): 0.98,
        (1, 0): 0.95,
        (0, 1): 0.90,
        (0, 0): 0.05,
    },
    # Conditioned on (HasLungCancer,)
    'HasDyspnea': {(1,): 0.7, (0,): 0.2},
}
def generate_data(num_samples=1000, *, output_path='data.csv', seed=None,
                  probs=None):
    """Sample a synthetic dataset from the Bayesian network and save it as CSV.

    Each row is drawn by ancestral sampling: the root variables (IsSmoker,
    VisitAsia) are sampled first, then every child is sampled from the CPT
    entry selected by its already-sampled parents.

    Args:
        num_samples: Number of rows to generate. Zero or a negative value
            yields an empty table.
        output_path: Where to write the CSV. Pass ``None`` to skip writing
            (and the confirmation message) and only return the DataFrame.
        seed: Optional seed. When given, a private ``np.random.default_rng``
            generator is used so the output is reproducible. When ``None``,
            NumPy's global random state is used, exactly as before.
        probs: Optional CPT dictionary with the same layout as the
            module-level ``PROBS``. Defaults to ``PROBS``.

    Returns:
        pandas.DataFrame with one 0/1 column per variable, in the order
        IsSmoker, VisitAsia, HasLungCancer, HasTuberculosis, XRayPositive,
        HasDyspnea.
    """
    cpts = PROBS if probs is None else probs
    # Both the ``np.random`` module and a ``Generator`` expose ``binomial``,
    # so the sampling code below is identical for either source.
    rng = np.random if seed is None else np.random.default_rng(seed)

    columns = ['IsSmoker', 'VisitAsia', 'HasLungCancer',
               'HasTuberculosis', 'XRayPositive', 'HasDyspnea']
    rows = []
    for _ in range(num_samples):
        # Root nodes. ``int(...)`` normalises NumPy scalars to plain ints so
        # they can be used reliably as CPT lookup keys.
        is_smoker = int(rng.binomial(1, cpts['IsSmoker']))
        visit_asia = int(rng.binomial(1, cpts['VisitAsia']))

        # Children, each conditioned on its parents. The order of draws is
        # kept the same as the original so a globally seeded stream yields
        # the same data.
        has_lc = int(rng.binomial(1, cpts['HasLungCancer'][(is_smoker,)]))
        has_tb = int(rng.binomial(1, cpts['HasTuberculosis'][(visit_asia,)]))
        xray_pos = int(rng.binomial(1, cpts['XRayPositive'][(has_lc, has_tb)]))
        has_dys = int(rng.binomial(1, cpts['HasDyspnea'][(has_lc,)]))

        rows.append([is_smoker, visit_asia, has_lc, has_tb, xray_pos, has_dys])

    df = pd.DataFrame(rows, columns=columns)
    if output_path is not None:
        df.to_csv(output_path, index=False)
        print(f"Generated {output_path} with {num_samples} samples.")
    return df
# Script entry point: when executed directly (not imported), generate a
# 2000-sample dataset.
if __name__ == "__main__":
    generate_data(num_samples=2000)