57 lines
1.8 KiB
Python
57 lines
1.8 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
|
|
# Ground-truth conditional probability tables (CPTs).
# Students are expected to "reverse-learn" these probabilities from the
# generated data.
#
# Root nodes map directly to P(node = 1). Child nodes map a tuple of parent
# values (in the order named in the trailing comment) to
# P(node = 1 | parents).
PROBS = {
    'IsSmoker': 0.2,
    'VisitAsia': 0.05,
    'HasLungCancer': {  # Given IsSmoker
        (1,): 0.1,
        (0,): 0.01,
    },
    'HasTuberculosis': {  # Given VisitAsia
        (1,): 0.08,
        (0,): 0.001,
    },
    'XRayPositive': {  # Given HasLungCancer, HasTuberculosis
        (1, 1): 0.98,
        (1, 0): 0.95,
        (0, 1): 0.90,
        (0, 0): 0.05,
    },
    'HasDyspnea': {  # Given HasLungCancer
        (1,): 0.7,
        (0,): 0.2,
    },
}
|
|
|
|
def generate_data(num_samples=1000, *, probs=None, seed=None,
                  output_path='data.csv'):
    """Sample a binary dataset from the network and optionally save it as CSV.

    Each sample is drawn ancestrally: the root variables (IsSmoker,
    VisitAsia) first, then each child variable conditioned on the values
    already drawn for its parents.

    Args:
        num_samples: Number of rows to generate. Must be non-negative.
        probs: Conditional probability tables with the same layout as the
            module-level ``PROBS``. Defaults to ``PROBS`` when ``None``.
        seed: When ``None`` the global ``np.random`` state is used, so
            callers that rely on ``np.random.seed`` see the same draw order
            as before. Otherwise a dedicated ``np.random.default_rng(seed)``
            generator is used, making the output reproducible.
        output_path: CSV destination. Pass ``None`` to skip writing the file
            (and the confirmation message).

    Returns:
        A ``pandas.DataFrame`` with one 0/1 column per variable.

    Raises:
        ValueError: If ``num_samples`` is negative.
    """
    if num_samples < 0:
        raise ValueError(
            f"num_samples must be non-negative, got {num_samples}")

    if probs is None:
        probs = PROBS

    # The legacy global RNG and the Generator API both expose `binomial`.
    rng = np.random if seed is None else np.random.default_rng(seed)

    def draw(p):
        # A single Bernoulli trial; int() normalises NumPy scalars so the
        # result can be used directly as a CPT key.
        return int(rng.binomial(1, p))

    # Hoist the table lookups out of the sampling loop.
    p_smoker = probs['IsSmoker']
    p_asia = probs['VisitAsia']
    cpt_lc = probs['HasLungCancer']
    cpt_tb = probs['HasTuberculosis']
    cpt_xray = probs['XRayPositive']
    cpt_dys = probs['HasDyspnea']

    rows = []
    for _ in range(num_samples):
        # Root nodes.
        is_smoker = draw(p_smoker)
        visit_asia = draw(p_asia)

        # Child nodes, each conditioned on its parents' sampled values.
        has_lc = draw(cpt_lc[(is_smoker,)])
        has_tb = draw(cpt_tb[(visit_asia,)])
        xray_pos = draw(cpt_xray[(has_lc, has_tb)])
        has_dys = draw(cpt_dys[(has_lc,)])

        rows.append(
            [is_smoker, visit_asia, has_lc, has_tb, xray_pos, has_dys])

    columns = ['IsSmoker', 'VisitAsia', 'HasLungCancer',
               'HasTuberculosis', 'XRayPositive', 'HasDyspnea']
    df = pd.DataFrame(rows, columns=columns)

    if output_path is not None:
        df.to_csv(output_path, index=False)
        print(f"Generated {output_path} with {num_samples} samples.")

    return df
|
|
|
|
if __name__ == '__main__':
    # Script entry point: generate a 2000-sample dataset.
    generate_data(num_samples=2000)