Skip to content

Cafe - Cellular Fate Explorer

Dataset

HuangDDU/CellFateExplorer

Dataset

Dataset tools

`cafe.data.read_h5ad(*args, **kwargs)`

Read a FateAnnData object from an h5ad file.

This function wraps scanpy.read_h5ad to read the data and then converts it into a FateAnnData object. It also handles the deserialization of trajectory_history_dict (reconstructing MilestoneWrapper and WaypointWrapper objects from dictionaries).

Parameters:

Name	Type	Description	Default
`*args`		Variable length argument list passed to `scanpy.read_h5ad`.	`()`
`**kwargs`		Arbitrary keyword arguments passed to `scanpy.read_h5ad`.	`{}`

Returns:

Name	Type	Description
`FateAnnData`		The loaded FateAnnData object with restored trajectory information.

Source code in cafe/data/fate_dataset.py

def read_h5ad(*args, **kwargs):
    """Load an h5ad file and return it as a `FateAnnData` object.

    The file is read with `scanpy.read_h5ad` and converted through
    `FateAnnData.from_anndata`. Every stored trajectory dict is then
    post-processed: `milestone_wrapper` / `waypoint_wrapper` entries that are
    plain dictionaries are rebuilt as `MilestoneWrapper` / `WaypointWrapper`
    objects and written back with `set_trajectory_dict`.

    Args:
        *args: Positional arguments forwarded to `scanpy.read_h5ad`.
        **kwargs: Keyword arguments forwarded to `scanpy.read_h5ad`.

    Returns:
        FateAnnData: The loaded object with its trajectory wrappers restored.
    """
    fadata = FateAnnData.from_anndata(sc.read_h5ad(*args, **kwargs))

    def rebuild_wrapper(wrapper_cls, stored_fields):
        # object.__new__ creates the instance without running __init__.
        # NOTE(review): the fields are written with item assignment, so the
        # wrapper classes presumably support `obj[key] = value` -- confirm.
        rebuilt = object.__new__(wrapper_cls)
        for field_name, field_value in stored_fields.items():
            rebuilt[field_name] = field_value
        return rebuilt

    def unserialize_trajectory_dict(fadata, model_name=None, recovery_raw_wrapper_dict=False):
        logger.debug(f"unserialize trajectory dict: '{model_name}'")
        # work on a copy of the stored trajectory dict
        restored = fadata.get_trajectory_dict(model_name).copy()

        stored_milestones = restored.get("milestone_wrapper", None)
        if isinstance(stored_milestones, dict):
            logger.debug(f"parse 'MilestoneWrapper' object for {model_name}")
            restored["milestone_wrapper"] = rebuild_wrapper(MilestoneWrapper, stored_milestones)

        stored_waypoints = restored.get("waypoint_wrapper", None)
        if isinstance(stored_waypoints, dict):
            logger.debug(f"parse 'WaypointWrapper' object for {model_name}")
            restored["waypoint_wrapper"] = rebuild_wrapper(WaypointWrapper, stored_waypoints)

        # raw_wrapper_dict is complex: it is never rebuilt, only reported
        if recovery_raw_wrapper_dict and "raw_wrapper_dict" in restored:
            logger.debug(f"skip recovery raw_wrapper_dict in serialized trajectory dict: '{model_name}'")
        return restored

    for model_name in fadata.get_all_model_name(parse=False):
        fadata.set_trajectory_dict(unserialize_trajectory_dict(fadata, model_name), model_name)

    return fadata

Builtin datasets

TODO: add ref trajectory visualization

`cafe.data.read_pancreas(filename=None, **subsample_kwargs)`

Source code in cafe/data/fate_dataset.py

def read_pancreas(filename=None, **subsample_kwargs):
    """Load the pancreas dataset together with its milestone network.

    Args:
        filename: Path to the h5ad file. When None, the file
            ``Pancreas/endocrinogenesis_day15.h5ad`` below
            ``settings.data_dir`` is used.
        **subsample_kwargs: Passed on as ``subsample_kwargs`` to
            ``_create_fadata_from_file``.

    Returns:
        The object returned by ``_create_fadata_from_file`` (id ``"pancreas"``).
    """
    h5ad_path = (
        filename
        if filename is not None
        else f"{settings.data_dir}/Pancreas/endocrinogenesis_day15.h5ad"
    )

    cluster_key = "clusters"
    basis_key = "X_umap"
    prior_information = {
        "start_cell": "cell_1103",
        "cluster": cluster_key,
        "basis": basis_key,
    }

    # milestone network edges, one [from, to] pair per row
    edges = [
        ["Ductal", "Ngn3 low EP"],
        ["Ngn3 low EP", "Ngn3 high EP"],
        ["Ngn3 high EP", "Pre-endocrine"],
        ["Pre-endocrine", "Alpha"],
        ["Pre-endocrine", "Beta"],
        ["Pre-endocrine", "Delta"],
        ["Pre-endocrine", "Epsilon"],
    ]

    return _create_fadata_from_file(
        filename=h5ad_path,
        milestone_network=pd.DataFrame(data=edges, columns=["from", "to"]),
        cluster=cluster_key,
        basis=basis_key,
        id="pancreas",
        prior_information=prior_information,
        subsample_kwargs=subsample_kwargs,
    )

`cafe.data.read_bonemarrow(filename=None, **subsample_kwargs)`

Read the bone marrow case-study dataset used by Palantir and scVelo.

Source code in cafe/data/fate_dataset.py

def read_bonemarrow(
    filename=None,
    **subsample_kwargs,
):
    """Read the bone marrow case-study dataset of Palantir and scVelo.

    Args:
        filename: Path to the h5ad file. When None, the file
            ``BoneMarrow/setty_bone_marrow.h5ad`` below ``settings.data_dir``
            is used.
        **subsample_kwargs: Passed on as ``subsample_kwargs`` to
            ``_create_fadata_from_file``.

    Returns:
        The object returned by ``_create_fadata_from_file`` (id ``"bonemarrow"``).
    """
    h5ad_path = (
        filename
        if filename is not None
        else f"{settings.data_dir}/BoneMarrow/setty_bone_marrow.h5ad"
    )

    cluster_key = "clusters"
    basis_key = "X_tsne"
    prior_information = {
        "start_milestone": "HSC_1",
        "start_cell": "cell_4823",
        "cluster": cluster_key,
        "basis": basis_key,
    }

    # milestone network edges, one [from, to] pair per row
    edges = [
        ["HSC_1", "HSC_2"],
        ["HSC_2", "Precursors"],
        ["HSC_2", "CLP"],
        ["HSC_2", "Ery_1"],
        ["Precursors", "Mono_1"],
        ["Precursors", "DCs"],
        ["Mono_1", "Mono_2"],
        ["Ery_1", "Ery_2"],
        ["Ery_1", "Mega"],
    ]

    return _create_fadata_from_file(
        filename=h5ad_path,
        milestone_network=pd.DataFrame(data=edges, columns=["from", "to"]),
        cluster=cluster_key,
        basis=basis_key,
        id="bonemarrow",
        prior_information=prior_information,
        subsample_kwargs=subsample_kwargs,
    )

`cafe.data.read_erythroid_lineage(filename=None, **subsample_kwargs)`

Source code in cafe/data/fate_dataset.py

def read_erythroid_lineage(
    filename=None,
    **subsample_kwargs,
):
    """Load the erythroid lineage dataset together with its milestone network.

    Args:
        filename: Path to the h5ad file. When None, the file
            ``Gastrulation/erythroid_lineage.h5ad`` below
            ``settings.data_dir`` is used.
        **subsample_kwargs: Passed on as ``subsample_kwargs`` to
            ``_create_fadata_from_file``.

    Returns:
        The object returned by ``_create_fadata_from_file``
        (id ``"erythroid_lineage"``).
    """
    h5ad_path = (
        filename
        if filename is not None
        else f"{settings.data_dir}/Gastrulation/erythroid_lineage.h5ad"
    )

    cluster_key = "celltype"
    basis_key = "X_umap"
    prior_information = {
        "start_cell": "cell_903",
        "end_cell": "cell_6099",
        "cluster": cluster_key,
        "basis": basis_key,
    }

    # milestone network edges, one [from, to] pair per row (a single linear chain)
    edges = [
        ["Blood progenitors 1", "Blood progenitors 2"],
        ["Blood progenitors 2", "Erythroid1"],
        ["Erythroid1", "Erythroid2"],
        ["Erythroid2", "Erythroid3"],
    ]

    return _create_fadata_from_file(
        filename=h5ad_path,
        milestone_network=pd.DataFrame(data=edges, columns=["from", "to"]),
        cluster=cluster_key,
        basis=basis_key,
        id="erythroid_lineage",
        prior_information=prior_information,
        subsample_kwargs=subsample_kwargs,
    )

`cafe.data.read_gastrulation(filename=None, **subsample_kwargs)`

Read the gastrulation case-study dataset.

Source code in cafe/data/fate_dataset.py

def read_gastrulation(
    filename=None,
    **subsample_kwargs,
):
    """Read the gastrulation case-study dataset.

    Args:
        filename: Path to the h5ad file. When None, the file
            ``Gastrulation/gastrulation.h5ad`` below ``settings.data_dir``
            is used.
        **subsample_kwargs: Passed on as ``subsample_kwargs`` to
            ``_create_fadata_from_file``.

    Returns:
        The object returned by ``_create_fadata_from_file``
        (id ``"gastrulation"``).
    """
    # Literature source: https://www.nature.com/articles/s41586-019-1825-8
    # Trajectory reference: https://github.com/MarioniLab/EmbryoTimecourse2018/blob/master/analysis_scripts/atlas/8_graph_abstraction/graph_abstraction.ipynb
    # Other resources:
    #   Wikipedia: https://zh.wikipedia.org/wiki/原肠胚形成
    #   YouTube video: https://www.youtube.com/watch?v=w9tJ7UiLrQs
    # Ectoderm: outer layer; develops into the epidermis, the neural crest and
    #   the tissue that later becomes the nervous system.
    # Mesoderm: middle layer; develops into dermis, spinal cord, blood vessels
    #   and blood, bone, muscle and connective tissue.
    # Endoderm: inner layer; develops into the epithelium of the digestive and
    #   respiratory systems, e.g. liver and pancreas.
    # This explains why StaVia re-annotates the cells.

    h5ad_path = (
        filename
        if filename is not None
        else f"{settings.data_dir}/Gastrulation/gastrulation.h5ad"
    )

    cluster_key = "celltype"
    basis_key = "X_umap"
    prior_information = {
        "cluster": cluster_key,
        "basis": basis_key,
    }

    # TODO: (the original note is empty; presumably the network below is still
    # to be completed -- confirm)
    edges = [
        ["Epiblast", "Anterior Primitive Streak"],
        ["Anterior Primitive Streak", "Primitive Streak"],
        ["Blood progenitors 1", "Blood progenitors 2"],
        ["Blood progenitors 2", "Erythroid1"],
        ["Erythroid1", "Erythroid2"],
        ["Erythroid2", "Erythroid3"],
    ]

    return _create_fadata_from_file(
        filename=h5ad_path,
        milestone_network=pd.DataFrame(data=edges, columns=["from", "to"]),
        cluster=cluster_key,
        basis=basis_key,
        id="gastrulation",
        prior_information=prior_information,
        subsample_kwargs=subsample_kwargs,
    )

`cafe.data.read_dynverse_simulation_data(filename=None, **subsample_kwargs)`

Source code in cafe/data/fate_dataset.py

def read_dynverse_simulation_data(
    filename=None,
    **subsample_kwargs,
):
    """Read a dynverse dataset stored as ``.rds`` and build a `FateAnnData` object.

    The file is loaded in R through rpy2 (``readRDS``). Its ``expression`` and
    ``counts`` matrices become layers; ``counts`` is used as ``X`` when both
    are present. Cell/feature tables, prior information, the reference
    trajectory (model name ``"ref"``) and the cell grouping are copied over
    when the dataset provides them.

    Args:
        filename: Path to the ``.rds`` file. When None, the file
            ``dynbenchmark/data/synthetic/dyntoy/bifurcating_1.rds`` below
            ``settings.data_dir`` is used.
        **subsample_kwargs: Accepted for signature parity with the other
            readers; not used by this function.

    Returns:
        FateAnnData: The dataset converted to a `FateAnnData` object.

    Raises:
        ValueError: If the dataset contains neither an ``expression`` nor a
            ``counts`` matrix.
    """
    if filename is None:
        filename = f"{settings.data_dir}/dynbenchmark/data/synthetic/dyntoy/bifurcating_1.rds"

    import rpy2.robjects as ro

    # Imported only for its side effect: rpy2 data structures are converted automatically.
    from ..util import rpy2_read  # noqa: F401

    # NOTE(review): the path is interpolated into R source text, so a filename
    # containing double quotes or backslashes would break the script.
    r_script = f"""
        dataset <- readRDS("{filename}")
        dataset
        """
    dataset = ro.r(r_script)

    # create the FateAnnData object based on the expression and count matrices
    layers = {}
    if "expression" in dataset:
        X = dataset["expression"]
        layers["expression"] = dataset["expression"]
    if "counts" in dataset:
        # counts takes precedence over expression as the main matrix
        X = dataset["counts"]
        layers["counts"] = dataset["counts"]
    if not layers:
        # without this check `X` would be unbound below
        raise ValueError(f"dataset '{filename}' contains neither 'expression' nor 'counts'")
    fadata = FateAnnData(name=dataset["id"], X=X)
    fadata.layers = layers

    # other AnnData attributes; keep the current table when the dataset has none
    fadata.obs = dataset.get("cell_info", fadata.obs)
    fadata.obs.index = dataset["cell_ids"]
    # fall back to the existing *feature* table (the original fell back to obs)
    fadata.var = dataset.get("feature_info", fadata.var)
    fadata.var.index = dataset.get("feature_ids", fadata.var.index)

    # call FateAnnData object methods
    if "prior_information" in dataset:
        fadata.add_prior_information(**dataset["prior_information"])
    if "milestone_network" in dataset:
        milestone_network = dataset["milestone_network"].reset_index(drop=True)
        milestone_percentages = dataset["milestone_percentages"]
        divergence_regions = dataset["divergence_regions"]
        # dataset["progressions"] is not passed on: it may cover milestone_percentages
        fadata.add_model_name("ref")
        fadata.add_trajectory(
            milestone_network=milestone_network,
            divergence_regions=divergence_regions,
            milestone_percentages=milestone_percentages,
        )

    if "grouping" in dataset:
        fadata.obs["grouping"] = pd.Categorical(dataset["grouping"], dataset["group_ids"])
    # TODO: waypoint add
    return fadata

`cafe.data.read_bifurcating_cellrank(filename='../../tests/data/bifurcating.h5ad', **subsample_kwargs)`

Source code in cafe/data/fate_dataset.py

def read_bifurcating_cellrank(
    filename="../../tests/data/bifurcating.h5ad",
    **subsample_kwargs,
):
    """Load the bifurcating CellRank dataset together with its milestone network.

    Args:
        filename: Path to the h5ad file; the default is the relative path
            ``../../tests/data/bifurcating.h5ad``.
        **subsample_kwargs: Passed on as ``subsample_kwargs`` to
            ``_create_fadata_from_file``.

    Returns:
        The object returned by ``_create_fadata_from_file``
        (id ``"bifurcating_cellrank"``).
    """
    cluster_key = "lineage"
    basis_key = "X_umap"
    prior_information = {
        # "start_milestone": "sA -> sB",
        "cluster": cluster_key,
        "basis": basis_key,
    }

    # milestone network edges, one [from, to] pair per row
    edges = [
        ["sA -> sB", "sB -> sBmid"],
        ["sB -> sBmid", "sBmid -> sC"],
        ["sB -> sBmid", "sBmid -> sD"],
        ["sBmid -> sC", "sC -> sEndC"],
        ["sBmid -> sD", "sD -> sEndD"],
    ]

    return _create_fadata_from_file(
        filename=filename,
        milestone_network=pd.DataFrame(data=edges, columns=["from", "to"]),
        cluster=cluster_key,
        basis=basis_key,
        id="bifurcating_cellrank",
        prior_information=prior_information,
        subsample_kwargs=subsample_kwargs,
    )