"""Generate synthetic binary data from a small hand-specified Bayesian network.

The network structure encoded by ``PROBS`` is:

    IsSmoker  -> HasLungCancer
    VisitAsia -> HasTuberculosis
    HasLungCancer, HasTuberculosis -> XRayPositive
    HasLungCancer -> HasDyspnea

Running this file as a script writes 2000 samples to ``data.csv``.
"""

import pandas as pd
import numpy as np

# Ground-truth conditional probability tables (CPTs).
# Students are expected to "reverse-learn" these probabilities from the
# generated data.
# Root nodes map to P(node = 1). Child nodes map a tuple of parent values
# to P(node = 1 | parents).
PROBS = {
    'IsSmoker': 0.2,
    'VisitAsia': 0.05,
    'HasLungCancer': {  # Given IsSmoker
        (1,): 0.1,
        (0,): 0.01,
    },
    'HasTuberculosis': {  # Given VisitAsia
        (1,): 0.08,
        (0,): 0.001,
    },
    'XRayPositive': {  # Given HasLungCancer, HasTuberculosis
        (1, 1): 0.98,
        (1, 0): 0.95,
        (0, 1): 0.90,
        (0, 0): 0.05,
    },
    'HasDyspnea': {  # Given HasLungCancer
        (1,): 0.7,
        (0,): 0.2,
    },
}

# Column order of the generated table; matches the order of values
# returned by ``_sample_row``.
COLUMNS = [
    'IsSmoker',
    'VisitAsia',
    'HasLungCancer',
    'HasTuberculosis',
    'XRayPositive',
    'HasDyspnea',
]


def _sample_row(rng):
    """Draw one joint sample (a list of six 0/1 values) in ``COLUMNS`` order.

    ``rng`` is any object exposing ``binomial(n, p)`` (the ``numpy.random``
    module or a ``numpy.random.RandomState``). Parents are drawn before
    their children, and the order of the six draws is fixed so that a given
    random state always yields the same row.
    """
    # Root nodes.
    is_smoker = rng.binomial(1, PROBS['IsSmoker'])
    visit_asia = rng.binomial(1, PROBS['VisitAsia'])

    # Child nodes, each conditioned on its already-sampled parents.
    has_lc = rng.binomial(1, PROBS['HasLungCancer'][(is_smoker,)])
    has_tb = rng.binomial(1, PROBS['HasTuberculosis'][(visit_asia,)])
    xray_pos = rng.binomial(1, PROBS['XRayPositive'][(has_lc, has_tb)])
    has_dys = rng.binomial(1, PROBS['HasDyspnea'][(has_lc,)])

    return [is_smoker, visit_asia, has_lc, has_tb, xray_pos, has_dys]


def generate_data(num_samples=1000, output_path='data.csv', seed=None):
    """Sample rows from the network, write them to CSV, and return them.

    Args:
        num_samples: Number of rows to generate.
        output_path: Destination CSV file (written without the index column).
        seed: Optional seed. When ``None`` the global ``numpy.random`` state
            is used, so callers that seed it via ``np.random.seed`` keep
            working. Otherwise a private ``RandomState(seed)`` is used and
            the global state is left untouched.

    Returns:
        The generated ``pandas.DataFrame`` with the columns in ``COLUMNS``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    rows = [_sample_row(rng) for _ in range(num_samples)]
    df = pd.DataFrame(rows, columns=COLUMNS)

    df.to_csv(output_path, index=False)
    print(f"Generated {output_path} with {num_samples} samples.")
    return df


if __name__ == '__main__':
    generate_data(num_samples=2000)