# BiasAnalyzerCore/tests/query_based/test_hierarchical_prevalence.py at 4a296b418415f7936a993679e7d3c035a8d5c679 · VACLab/BiasAnalyzerCore · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import pytest
from numpy.ma.testutils import assert_equal

from biasanalyzer.concept import ConceptHierarchy


def test_cohort_concept_hierarchical_prevalence(test_db, caplog):
    """Exercise ``get_concept_stats`` on a cohort selected from condition_occurrence.

    The test checks, in order:
    * an unsupported ``concept_type`` or ``vocab`` raises ``ValueError``;
    * asking for ``procedure_occurrence`` concepts yields an empty list;
    * the ICD10CM stats contain at least one ancestor/descendant pair that differs;
    * self-paired rows exist for concept ids 1, 2 and 3 (labelled parent diabetes,
      type 1 and type 2 in the assertion messages), and the parent's prevalence is
      strictly below the sum of the two children's prevalence.
    """
    condition_sql = """
        SELECT person_id, condition_concept_id,
        condition_start_date as cohort_start_date,
        condition_end_date as cohort_end_date
        FROM condition_occurrence;
    """
    cohort = test_db.create_cohort(
        "Diabetes Cohort",
        "Cohort of patients with diabetes-related conditions",
        condition_sql,
        "test_user",
    )
    assert cohort is not None, "Cohort creation failed"

    # concept_type has to be one of the supported OMOP domain names
    with pytest.raises(ValueError):
        cohort.get_concept_stats(concept_type="dummy_invalid")

    # vocab has to be None (default vocabulary) or a supported OMOP vocabulary id
    with pytest.raises(ValueError):
        cohort.get_concept_stats(vocab="dummy_invalid_vocab")

    # this cohort carries no procedure_occurrence concepts
    procedure_stats, _ = cohort.get_concept_stats(concept_type="procedure_occurrence")
    assert_equal(procedure_stats, {"procedure_occurrence": []})

    concept_stats, _ = cohort.get_concept_stats(vocab="ICD10CM", print_concept_hierarchy=True)
    assert concept_stats is not None, "Failed to fetch concept stats"
    assert len(concept_stats) > 0, "No concept stats returned"

    # at least one returned row must link two distinct concepts
    assert any(
        row["ancestor_concept_id"] != row["descendant_concept_id"] for row in concept_stats["condition_occurrence"]
    ), "Some ancestor_concept_id and descendant_concept_id should differ"

    def self_paired_row(concept_id):
        """Return the first row pairing ``concept_id`` with itself, or None if absent."""
        for row in concept_stats["condition_occurrence"]:
            if row["ancestor_concept_id"] == concept_id and row["descendant_concept_id"] == concept_id:
                return row
        return None

    # Check concept prevalence for overlaps
    diabetes_prevalence = self_paired_row(1)
    assert diabetes_prevalence is not None, "Parent diabetes concept prevalence missing"
    type1_prevalence = self_paired_row(2)
    assert type1_prevalence is not None, "Child type 1 diabetes concept prevalence missing"
    type2_prevalence = self_paired_row(3)
    assert type2_prevalence is not None, "Child type 2 diabetes concept prevalence missing"
    print(
        f"type1_prevalence: {type1_prevalence['prevalence']}, type2_prevalence: {type2_prevalence['prevalence']}, "
        f"diabetes_prevalence: {diabetes_prevalence['prevalence']}"
    )

    # The children's prevalence summed must exceed the parent's prevalence.
    children_total = type1_prevalence["prevalence"] + type2_prevalence["prevalence"]
    assert diabetes_prevalence["prevalence"] < children_total, (
        "Parent diabetes concept prevalence does not reflect overlap between type 1 and type 2 diabetes "
        "children concept prevalence"
    )


def test_identifier_normalization_and_cache():
    """Check identifier normalization and cache reuse of built hierarchies.

    Building twice with the same cohort id and concept type must hand back the
    very same object even when the result rows differ; building with another
    concept type must produce a distinct object with its own identifier.
    """

    def diabetes_row(name, count, prevalence):
        """Build one minimal self-paired result row for concept id 1."""
        return {
            "ancestor_concept_id": 1,
            "descendant_concept_id": 1,
            "concept_name": name,
            "concept_code": "DIA",
            "count_in_cohort": count,
            "prevalence": prevalence,
        }

    ConceptHierarchy.clear_cache()

    # "+"-joined identifiers come back reordered and without the repeated part
    assert ConceptHierarchy._normalize_identifier("2+1") == "1+2"
    assert ConceptHierarchy._normalize_identifier("1+2+2") == "1+2"

    # two different minimal result sets for the same cohort id
    first_rows = [diabetes_row("Diabetes", 5, 0.5)]
    second_rows = [diabetes_row("Diabetes2", 15, 0.15)]

    condition_hierarchy = ConceptHierarchy.build_concept_hierarchy_from_results(
        1, "condition_occurrence", first_rows
    )
    rebuilt_hierarchy = ConceptHierarchy.build_concept_hierarchy_from_results(
        1, "condition_occurrence", second_rows
    )
    # the cached object is returned although second_rows differs from first_rows
    assert condition_hierarchy is rebuilt_hierarchy
    assert condition_hierarchy.identifier == "1-condition_occurrence-0-None"

    drug_hierarchy = ConceptHierarchy.build_concept_hierarchy_from_results(1, "drug_exposure", second_rows)
    # the drug_exposure concept type differs from the cached condition_occurrence one,
    # so the cached hierarchy is not reused
    assert condition_hierarchy is not drug_hierarchy
    assert drug_hierarchy.identifier == "1-drug_exposure-0-None"


def test_union_and_cache_behavior():
    """Check that ``union`` gives the same cached hierarchy regardless of operand order.

    Two single-row hierarchies (cohort ids 1 and 2) are built, both must appear
    in ``ConceptHierarchy._graph_cache``, and ``h1.union(h2)`` / ``h2.union(h1)``
    must share one identifier and be the same object.
    """

    def single_row(concept_id, name, code, count, prevalence):
        """Build one minimal self-paired result row."""
        return {
            "ancestor_concept_id": concept_id,
            "descendant_concept_id": concept_id,
            "concept_name": name,
            "concept_code": code,
            "count_in_cohort": count,
            "prevalence": prevalence,
        }

    ConceptHierarchy.clear_cache()
    diabetes_rows = [single_row(1, "Diabetes", "DIA", 5, 0.5)]
    hypertension_rows = [single_row(2, "Hypertension", "HYP", 3, 0.3)]

    first = ConceptHierarchy.build_concept_hierarchy_from_results(1, "condition_occurrence", diabetes_rows)
    second = ConceptHierarchy.build_concept_hierarchy_from_results(2, "condition_occurrence", hypertension_rows)

    # each build must have registered its identifier in the class-level cache
    for cache_key in ("1-condition_occurrence-0-None", "2-condition_occurrence-0-None"):
        assert cache_key in ConceptHierarchy._graph_cache

    first_then_second = first.union(second)
    second_then_first = second.union(first)

    # the union identifier does not depend on which side initiated the union
    expected_identifier = "1-condition_occurrence-0-None+2-condition_occurrence-0-None"
    assert first_then_second.identifier == expected_identifier
    assert second_then_first.identifier == expected_identifier
    assert first_then_second is second_then_first


def test_traversal_and_serialization():
    """Check root/leaf lookup, traversal order and dict serialization of a hierarchy.

    The hierarchy is built for cohort id 1 from two rows: a self-paired "Root"
    (concept 1) and a "Child" (concept 2) whose ancestor is concept 1.
    """

    def expected_root(**extra):
        """Return a fresh expected serialized root node; ``extra`` adds or replaces keys."""
        node = {
            "concept_id": 1,
            "concept_name": "Root",
            "concept_code": "R",
            "metrics": {"1": {"count": 5, "prevalence": 0.5}},
            "source_cohorts": [1],
            "parent_ids": [],
        }
        node.update(extra)
        return node

    def expected_child(**extra):
        """Return a fresh expected serialized child node; ``extra`` adds or replaces keys."""
        node = {
            "concept_id": 2,
            "concept_name": "Child",
            "concept_code": "C",
            "metrics": {"1": {"count": 2, "prevalence": 0.2}},
            "source_cohorts": [1],
            "parent_ids": [1],
        }
        node.update(extra)
        return node

    ConceptHierarchy.clear_cache()
    rows = [
        {
            "ancestor_concept_id": 1,
            "descendant_concept_id": 1,
            "concept_name": "Root",
            "concept_code": "R",
            "count_in_cohort": 5,
            "prevalence": 0.5,
        },
        {
            "ancestor_concept_id": 1,
            "descendant_concept_id": 2,
            "concept_name": "Child",
            "concept_code": "C",
            "count_in_cohort": 2,
            "prevalence": 0.2,
        },
    ]
    hierarchy = ConceptHierarchy.build_concept_hierarchy_from_results(1, "condition_occurrence", rows)

    # --- root nodes ---
    root_nodes = hierarchy.get_root_nodes()
    assert len(root_nodes) == 1
    root = root_nodes[0]
    assert root.name == "Root"
    assert root.get_metrics(1) == {"count": 5, "prevalence": 0.5}
    assert [child.name for child in root.children] == ["Child"]

    # --- leaf nodes, serialized and as node objects ---
    assert hierarchy.get_leaf_nodes(serialization=True) == [expected_child()]

    leaf_nodes = hierarchy.get_leaf_nodes()
    assert len(leaf_nodes) == 1
    leaf = leaf_nodes[0]
    assert leaf.name == "Child"
    assert [parent.name for parent in leaf.parents] == ["Root"]

    assert hierarchy.get_node(1, serialization=True) == expected_root()

    # --- graph traversal ---
    # list() forces the generator to run so the error actually surfaces
    with pytest.raises(ValueError):
        # unknown root id
        list(hierarchy.iter_nodes(111, order="bfs"))

    with pytest.raises(ValueError):
        # unknown traversal order
        list(hierarchy.iter_nodes(1, order="dummy"))

    assert [node.id for node in hierarchy.iter_nodes(1, order="bfs")] == [1, 2]

    # for DFS only the set of visited ids is checked, not their order
    assert {node.id for node in hierarchy.iter_nodes(1, order="dfs")} == {1, 2}
    assert {node["concept_id"] for node in hierarchy.iter_nodes(1, order="dfs", serialization=True)} == {1, 2}

    # --- serialization ---
    first_serialized_root = hierarchy.get_root_nodes(serialization=True)[0]
    assert first_serialized_root["concept_name"] == "Root"
    assert "metrics" in first_serialized_root

    serialized_nodes = list(hierarchy.iter_nodes(1, serialization=True))
    assert all(isinstance(node, dict) for node in serialized_nodes)
    assert serialized_nodes[0]["concept_id"] == 1

    with pytest.raises(ValueError):
        hierarchy.to_dict(111)

    # with union metrics every node carries a "union" entry next to the per-cohort one
    assert hierarchy.to_dict(1, include_union_metrics=True) == {
        "hierarchy": [
            expected_root(
                metrics={"union": {"count": 5, "prevalence": 0.5}, "1": {"count": 5, "prevalence": 0.5}},
                children=[
                    expected_child(
                        metrics={"union": {"count": 2, "prevalence": 0.2}, "1": {"count": 2, "prevalence": 0.2}},
                        children=[],
                    )
                ],
            )
        ]
    }

    # called without arguments, only per-cohort metrics are expected
    assert hierarchy.to_dict() == {"hierarchy": [expected_root(children=[expected_child(children=[])])]}