Skip to content

Dataset

Dataset tools

cafe.data.read_h5ad(*args, **kwargs)

Read a FateAnnData object from an h5ad file.

This function wraps scanpy.read_h5ad to read the data and then converts it into a FateAnnData object. It also handles the deserialization of trajectory_history_dict (reconstructing MilestoneWrapper and WaypointWrapper objects from dictionaries).

Parameters:

Name Type Description Default
*args

Variable length argument list passed to scanpy.read_h5ad.

()
**kwargs

Arbitrary keyword arguments passed to scanpy.read_h5ad.

{}

Returns:

Name Type Description
FateAnnData

The loaded FateAnnData object with restored trajectory information.

Source code in cafe/data/fate_dataset.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def read_h5ad(*args, **kwargs):
    """Load an h5ad file and return it as a `FateAnnData`.

    The file is read with `scanpy.read_h5ad` and converted through
    `FateAnnData.from_anndata`. Afterwards every stored trajectory dict is
    post-processed: `milestone_wrapper` / `waypoint_wrapper` entries that are
    plain dictionaries are rebuilt as `MilestoneWrapper` / `WaypointWrapper`
    instances and the updated dict is written back.

    Args:
        *args: Positional arguments forwarded unchanged to `scanpy.read_h5ad`.
        **kwargs: Keyword arguments forwarded unchanged to `scanpy.read_h5ad`.

    Returns:
        FateAnnData: The converted object with its trajectory dicts rewritten.
    """

    def _rebuild_wrapper(wrapper_cls, fields):
        # object.__new__ creates the instance without running __init__; the
        # stored entries are then written back one at a time.
        # NOTE(review): item assignment requires the wrapper classes to
        # implement __setitem__ -- confirm against their definitions.
        instance = object.__new__(wrapper_cls)
        for name, value in fields.items():
            instance[name] = value
        return instance

    def _restore_trajectory_dict(container, model_name=None, recovery_raw_wrapper_dict=False):
        logger.debug(f"unserialize trajectory dict: '{model_name}'")
        # Work on a shallow copy so the stored dict is only replaced via
        # set_trajectory_dict below.
        restored = container.get_trajectory_dict(model_name).copy()

        stored_milestones = restored.get("milestone_wrapper")
        if isinstance(stored_milestones, dict):
            logger.debug(f"parse 'MilestoneWrapper' object for {model_name}")
            restored["milestone_wrapper"] = _rebuild_wrapper(MilestoneWrapper, stored_milestones)

        stored_waypoints = restored.get("waypoint_wrapper")
        if isinstance(stored_waypoints, dict):
            logger.debug(f"parse 'WaypointWrapper' object for {model_name}")
            restored["waypoint_wrapper"] = _rebuild_wrapper(WaypointWrapper, stored_waypoints)

        # raw_wrapper_dict is never rebuilt; when recovery was requested this
        # is only reported in the debug log.
        if recovery_raw_wrapper_dict and "raw_wrapper_dict" in restored:
            logger.debug(f"skip recovery raw_wrapper_dict in serialized trajectory dict: '{model_name}'")
        return restored

    loaded = FateAnnData.from_anndata(sc.read_h5ad(*args, **kwargs))

    for model_name in loaded.get_all_model_name(parse=False):
        loaded.set_trajectory_dict(_restore_trajectory_dict(loaded, model_name), model_name)

    return loaded

Builtin datasets

TODO: add reference trajectory visualization.

cafe.data.read_pancreas(filename=None, **subsample_kwargs)

Source code in cafe/data/fate_dataset.py
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
def read_pancreas(filename=None, **subsample_kwargs):
    """Read the pancreas (endocrinogenesis day 15) dataset with its reference milestones.

    Args:
        filename: Path to the h5ad file. When ``None``, defaults to
            ``{settings.data_dir}/Pancreas/endocrinogenesis_day15.h5ad``.
        **subsample_kwargs: Collected into a dict and handed to
            `_create_fadata_from_file` as its ``subsample_kwargs`` argument.

    Returns:
        Whatever `_create_fadata_from_file` returns for this dataset.
    """
    if filename is None:
        filename = f"{settings.data_dir}/Pancreas/endocrinogenesis_day15.h5ad"

    # Directed milestone edges (from -> to) of the reference trajectory.
    edges = [
        ["Ductal", "Ngn3 low EP"],
        ["Ngn3 low EP", "Ngn3 high EP"],
        ["Ngn3 high EP", "Pre-endocrine"],
        ["Pre-endocrine", "Alpha"],
        ["Pre-endocrine", "Beta"],
        ["Pre-endocrine", "Delta"],
        ["Pre-endocrine", "Epsilon"],
    ]
    reference_network = pd.DataFrame(data=edges, columns=["from", "to"])

    priors = {
        "start_cell": "cell_1103",
        "cluster": "clusters",
        "basis": "X_umap",
    }

    # cluster / basis are passed explicitly and also travel inside the priors.
    return _create_fadata_from_file(
        filename=filename,
        milestone_network=reference_network,
        cluster=priors["cluster"],
        basis=priors["basis"],
        id="pancreas",
        prior_information=priors,
        subsample_kwargs=subsample_kwargs,
    )

cafe.data.read_bonemarrow(filename=None, **subsample_kwargs)

Read the bone marrow case-study dataset used by Palantir and scVelo.

Source code in cafe/data/fate_dataset.py
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
def read_bonemarrow(
    filename=None,
    **subsample_kwargs,
):
    """Read the bone marrow case-study dataset of Palantir and scVelo.

    Args:
        filename: Path to the h5ad file. When ``None``, defaults to
            ``{settings.data_dir}/BoneMarrow/setty_bone_marrow.h5ad``.
        **subsample_kwargs: Subsampling options; collected into a dict and
            handed to `_create_fadata_from_file` as ``subsample_kwargs``.

    Returns:
        Whatever `_create_fadata_from_file` returns for this dataset.
    """
    if filename is None:
        filename = f"{settings.data_dir}/BoneMarrow/setty_bone_marrow.h5ad"

    # Directed milestone edges (from -> to) of the reference trajectory.
    edges = [
        ["HSC_1", "HSC_2"],
        ["HSC_2", "Precursors"],
        ["HSC_2", "CLP"],
        ["HSC_2", "Ery_1"],
        ["Precursors", "Mono_1"],
        ["Precursors", "DCs"],
        ["Mono_1", "Mono_2"],
        ["Ery_1", "Ery_2"],
        ["Ery_1", "Mega"],
    ]
    reference_network = pd.DataFrame(data=edges, columns=["from", "to"])

    priors = {
        "start_milestone": "HSC_1",
        "start_cell": "cell_4823",
        "cluster": "clusters",
        "basis": "X_tsne",
    }

    return _create_fadata_from_file(
        filename=filename,
        milestone_network=reference_network,
        cluster=priors["cluster"],
        basis=priors["basis"],
        id="bonemarrow",
        prior_information=priors,
        subsample_kwargs=subsample_kwargs,
    )

cafe.data.read_erythroid_lineage(filename=None, **subsample_kwargs)

Source code in cafe/data/fate_dataset.py
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
def read_erythroid_lineage(
    filename=None,
    **subsample_kwargs,
):
    """Read the erythroid-lineage dataset with a linear reference trajectory.

    Args:
        filename: Path to the h5ad file. When ``None``, defaults to
            ``{settings.data_dir}/Gastrulation/erythroid_lineage.h5ad``.
        **subsample_kwargs: Collected into a dict and handed to
            `_create_fadata_from_file` as its ``subsample_kwargs`` argument.

    Returns:
        Whatever `_create_fadata_from_file` returns for this dataset.
    """
    if filename is None:
        filename = f"{settings.data_dir}/Gastrulation/erythroid_lineage.h5ad"

    # A single chain of milestones, listed as directed (from -> to) edges.
    edges = [
        ["Blood progenitors 1", "Blood progenitors 2"],
        ["Blood progenitors 2", "Erythroid1"],
        ["Erythroid1", "Erythroid2"],
        ["Erythroid2", "Erythroid3"],
    ]
    reference_network = pd.DataFrame(data=edges, columns=["from", "to"])

    priors = {
        "start_cell": "cell_903",
        "end_cell": "cell_6099",
        "cluster": "celltype",
        "basis": "X_umap",
    }

    return _create_fadata_from_file(
        filename=filename,
        milestone_network=reference_network,
        cluster=priors["cluster"],
        basis=priors["basis"],
        id="erythroid_lineage",
        prior_information=priors,
        subsample_kwargs=subsample_kwargs,
    )

cafe.data.read_gastrulation(filename=None, **subsample_kwargs)

Read the gastrulation case-study dataset.

Source code in cafe/data/fate_dataset.py
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
def read_gastrulation(
    filename=None,
    **subsample_kwargs,
):
    """Read the gastrulation case-study dataset.

    Args:
        filename: Path to the h5ad file. When ``None``, defaults to
            ``{settings.data_dir}/Gastrulation/gastrulation.h5ad``.
        **subsample_kwargs: Collected into a dict and handed to
            `_create_fadata_from_file` as its ``subsample_kwargs`` argument.

    Returns:
        Whatever `_create_fadata_from_file` returns for this dataset.
    """
    # Literature source: https://www.nature.com/articles/s41586-019-1825-8
    # Trajectory reference: https://github.com/MarioniLab/EmbryoTimecourse2018/blob/master/analysis_scripts/atlas/8_graph_abstraction/graph_abstraction.ipynb
    # Other material:
    #   Wikipedia: https://zh.wikipedia.org/wiki/原肠胚形成
    #   YouTube video: https://www.youtube.com/watch?v=w9tJ7UiLrQs
    # Ectoderm: outer layer; develops into the epidermis, the neural crest and
    #   the tissue that later becomes the nervous system.
    # Mesoderm: middle layer; develops into the dermis, spinal cord [sic in the
    #   original note], blood vessels and blood, bone, muscle and connective tissue.
    # Endoderm: inner layer; develops into the epithelium of the digestive and
    #   respiratory systems, e.g. liver and pancreas.
    # This explains why stavia had to re-annotate the cells.

    if filename is None:
        filename = f"{settings.data_dir}/Gastrulation/gastrulation.h5ad"

    # TODO: (left unspecified in the original)
    # NOTE(review): the edges below form two chains that share no milestone
    # (Epiblast -> ... -> Primitive Streak and Blood progenitors 1 -> ... ->
    # Erythroid3) -- confirm whether the reference network is complete.
    edges = [
        ["Epiblast", "Anterior Primitive Streak"],
        ["Anterior Primitive Streak", "Primitive Streak"],
        ["Blood progenitors 1", "Blood progenitors 2"],
        ["Blood progenitors 2", "Erythroid1"],
        ["Erythroid1", "Erythroid2"],
        ["Erythroid2", "Erythroid3"],
    ]
    reference_network = pd.DataFrame(data=edges, columns=["from", "to"])

    priors = {
        "cluster": "celltype",
        "basis": "X_umap",
    }

    return _create_fadata_from_file(
        filename=filename,
        milestone_network=reference_network,
        cluster=priors["cluster"],
        basis=priors["basis"],
        id="gastrulation",
        prior_information=priors,
        subsample_kwargs=subsample_kwargs,
    )

cafe.data.read_dynverse_simulation_data(filename=None, **subsample_kwargs)

Source code in cafe/data/fate_dataset.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def read_dynverse_simulation_data(
    filename=None,
    **subsample_kwargs,
):
    """Read a dynverse ``.rds`` dataset through rpy2 and build a FateAnnData.

    The dataset is loaded with R's ``readRDS`` and accessed like a mapping.
    "expression" and "counts" (when present) are stored as layers; ``X`` is
    taken from "counts" if it exists, otherwise from "expression". Optional
    entries ("cell_info", "feature_info", "feature_ids", "prior_information",
    "milestone_network", "grouping") are applied when present.

    Args:
        filename: Path to the ``.rds`` file. When ``None``, defaults to
            ``{settings.data_dir}/dynbenchmark/data/synthetic/dyntoy/bifurcating_1.rds``.
        **subsample_kwargs: Accepted for signature parity with the other
            readers; not used by this function.

    Returns:
        FateAnnData: The object populated from the dataset; a "ref" model and
        trajectory are added when the dataset has a "milestone_network".

    Raises:
        ValueError: If the dataset has neither "expression" nor "counts".
    """
    if filename is None:
        filename = f"{settings.data_dir}/dynbenchmark/data/synthetic/dyntoy/bifurcating_1.rds"

    import rpy2.robjects as ro

    # Imported for its side effect only: per the original note it makes rpy2
    # convert data structures automatically.
    from ..util import rpy2_read  # noqa: F401

    # The path ends up inside an R string literal, so backslashes (Windows
    # paths) and double quotes have to be escaped for R.
    r_path = str(filename).replace("\\", "\\\\").replace('"', '\\"')
    r_script = f"""
        dataset <- readRDS("{r_path}")
        dataset
        """
    dataset = ro.r(r_script)

    # Create the FateAnnData object from the expression and/or count matrix.
    # "counts" is checked last, so it wins as X when both are present.
    layers = {}
    if "expression" in dataset:
        X = dataset["expression"]
        layers["expression"] = dataset["expression"]
    if "counts" in dataset:
        X = dataset["counts"]
        layers["counts"] = dataset["counts"]
    if not layers:
        raise ValueError(
            f"dynverse dataset '{filename}' contains neither 'expression' nor 'counts'"
        )
    fadata = FateAnnData(name=dataset["id"], X=X)
    fadata.layers = layers

    # Other AnnData attributes: keep the existing table when the dataset does
    # not provide one.
    fadata.obs = dataset.get("cell_info", fadata.obs)
    fadata.obs.index = dataset["cell_ids"]
    # The fallback must be the var table; falling back to obs (as before)
    # assigned a per-cell table to the per-feature slot.
    fadata.var = dataset.get("feature_info", fadata.var)
    fadata.var.index = dataset.get("feature_ids", fadata.var.index)

    # Populate trajectory-related state through the FateAnnData API.
    if "prior_information" in dataset:
        fadata.add_prior_information(**dataset["prior_information"])
    if "milestone_network" in dataset:
        milestone_network = dataset["milestone_network"].reset_index(drop=True)
        milestone_percentages = dataset["milestone_percentages"]
        divergence_regions = dataset["divergence_regions"]
        fadata.add_model_name("ref")
        # "progressions" is deliberately not passed: per the original note it
        # may override milestone_percentages.
        fadata.add_trajectory(
            milestone_network=milestone_network,
            divergence_regions=divergence_regions,
            milestone_percentages=milestone_percentages,
        )

    if "grouping" in dataset:
        fadata.obs["grouping"] = pd.Categorical(dataset["grouping"], dataset["group_ids"])
    # TODO: waypoint add
    return fadata

cafe.data.read_bifurcating_cellrank(filename='../../tests/data/bifurcating.h5ad', **subsample_kwargs)

Source code in cafe/data/fate_dataset.py
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
def read_bifurcating_cellrank(
    filename="../../tests/data/bifurcating.h5ad",
    **subsample_kwargs,
):
    """Read the bifurcating test dataset with its reference milestone network.

    Args:
        filename: Path to the h5ad file. The default is a relative path
            (``../../tests/data/bifurcating.h5ad``), so it resolves against
            the current working directory.
        **subsample_kwargs: Collected into a dict and handed to
            `_create_fadata_from_file` as its ``subsample_kwargs`` argument.

    Returns:
        Whatever `_create_fadata_from_file` returns for this dataset.
    """
    # Milestones are named after state transitions ("sX -> sY"); each row is a
    # directed edge between two such milestones.
    edges = [
        ["sA -> sB", "sB -> sBmid"],
        ["sB -> sBmid", "sBmid -> sC"],
        ["sB -> sBmid", "sBmid -> sD"],
        ["sBmid -> sC", "sC -> sEndC"],
        ["sBmid -> sD", "sD -> sEndD"],
    ]
    reference_network = pd.DataFrame(data=edges, columns=["from", "to"])

    # No "start_milestone" prior is set (the original keeps
    # `"start_milestone": "sA -> sB"` commented out).
    priors = {
        "cluster": "lineage",
        "basis": "X_umap",
    }

    return _create_fadata_from_file(
        filename=filename,
        milestone_network=reference_network,
        cluster=priors["cluster"],
        basis=priors["basis"],
        id="bifurcating_cellrank",
        prior_information=priors,
        subsample_kwargs=subsample_kwargs,
    )