cafe.method.CondaBackend

`cafe.method.CondaBackend`

Bases: Backend

Specific implementation of abstract Backend class using Python functions.

Source code in cafe/method/fate_conda_backend.py

class CondaBackend(Backend):
    """Specific implementation of abstract Backend class using Python functions."""

    # TODO: this class currently supports Python methods only; it needs to be extended to R methods.

    def __init__(self, function_name="comp1", conda_name="cafe", id=""):
        """Store the backend configuration and load the backend right away.

        Args:
            function_name: name of the method function, kept as ``self.function_name``.
            conda_name: name of the conda environment, kept as ``self.conda_name``.
            id: identifier kept as ``self.id``.
        """
        self.function_name, self.conda_name, self.id = function_name, conda_name, id
        # the backend is prepared eagerly, at construction time
        self.load_backend()

    def load_backend(self):
        """Make sure the conda environment is usable, then load the method function.

        The environment is created through ``install_conda_env`` when
        ``test_conda_env`` does not find it. It is then probed with
        ``conda run -n <conda_name> python --version`` and finally
        ``self._load_function(self.function_name)`` is called.

        Raises:
            RuntimeError: if the probe command cannot be started, does not finish
                within 10 seconds, or exits with a non-zero code.
        """
        logger.debug("load conda backend")
        if not self.test_conda_env():
            self.install_conda_env()

        # argument list (no shell), like test_conda_env: the environment name is passed verbatim
        cmd = ["conda", "run", "-n", str(self.conda_name), "python", "--version"]
        try:
            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=10)
        except (OSError, subprocess.TimeoutExpired) as err:
            # conda missing or the probe hanging is reported like any other unavailable environment
            logger.error(f"Conda environment '{self.conda_name}' not available: {err}")
            raise RuntimeError(f"Conda environment '{self.conda_name}' not available.") from err

        if result.returncode != 0:
            logger.error(f"Conda environment '{self.conda_name}' not available: {result.stderr.strip()}")
            raise RuntimeError(f"Conda environment '{self.conda_name}' not available.")
        logger.debug(f"Conda environment '{self.conda_name}' is available: {result.stdout.strip()}", indent_level=2)

        # load function to get parameter
        self._load_function(self.function_name)

    def preprocess(self, adata: AnnData, parameters: dict, tmp_wd: str) -> None:
        """Write the inputs of the method into ``tmp_wd``.

        ``adata`` goes to ``<tmp_wd>/adata.h5ad`` (that path is first recorded in
        ``adata.uns["filename"]``) and ``parameters`` is dumped as JSON to
        ``<tmp_wd>/parameters.json``.
        """
        h5ad_path = f"{tmp_wd}/adata.h5ad"
        # the path is stored inside the object before writing, so it ends up in the file too
        adata.uns["filename"] = h5ad_path
        adata.write(filename=h5ad_path)

        keep_input = settings.save_external_data or settings.save_h5ad
        if keep_input:
            # execute() reads self.adata when these settings are enabled
            self.adata = adata

        params_path = f"{tmp_wd}/parameters.json"
        with open(params_path, "w") as params_file:
            json.dump(parameters, params_file)

    def execute(self, tmp_wd: str, benchmark_resource: bool = False) -> dict:
        """Run the method with ``conda run`` inside ``tmp_wd`` and return its trajectory dict.

        Args:
            tmp_wd (str): tmp working dir; adata.h5ad and parameters.json are read from it and
                output.pkl (plus output.h5ad when enabled in settings) is written into it.
            benchmark_resource (bool): when true, the command is wrapped in ``/usr/bin/time -v`` and
                the captured stderr is parsed into ``trajectory_dict["resource_usage"]``.

        Returns:
            dict: trajectory dict loaded from output.pkl; an empty dict when the command could not
            be started or produced no output.pkl.
        """
        import shutil

        trajectory_dict = {}

        parse_args_script = f"{os.path.dirname(__file__)}/function/parse_args.py"

        # construct command as an argument list (no shell), so paths containing spaces or shell
        # metacharacters reach the program verbatim
        cmd = [
            "conda",
            "run",
            "-n",
            str(self.conda_name),
            "--no-capture-output",  # use conda environment to run
            "python",
            parse_args_script,
            f"--function_name={self.function_name}",
            f"--adata_path={tmp_wd}/adata.h5ad",
            f"--parameters={tmp_wd}/parameters.json",
            f"--output_filename={tmp_wd}/output.pkl",
        ]
        if settings.save_external_data or settings.save_h5ad:
            cmd.append(f"--save_h5ad={tmp_wd}/output.h5ad")
        if benchmark_resource:
            cmd = ["/usr/bin/time", "-v", *cmd]
        logger.debug(f"cmd: {' '.join(cmd)} (cwd: {tmp_wd})")

        # Set environment variable for matplotlib to use a non-GUI backend
        env = os.environ.copy()
        env["MPLBACKEND"] = "Agg"

        # execute command with tmp_wd as working dir, so intermediate files land in the tmp dir
        try:
            process = subprocess.Popen(cmd, cwd=tmp_wd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env)
        except OSError as err:
            # e.g. conda or /usr/bin/time missing: same outcome as a run without output
            logger.error(f"Conda error, could not start command: {err}")
            return trajectory_dict

        stderr_lines = []  # to capture stderr for later resource usage parsing
        readers = [
            threading.Thread(target=print_output(logger.debug), args=(process.stdout, "[conda-excute-stdout]"), daemon=True),
            threading.Thread(target=print_output(logger.debug, stderr_lines), args=(process.stderr, "[conda-excute-stderr]"), daemon=True),
        ]
        for reader in readers:
            reader.start()
        # wait for process
        process.wait()
        # Let the readers drain the pipes before stderr_lines is used: the /usr/bin/time report is
        # written last. The timeout only guards against a stray child keeping a pipe open.
        for reader in readers:
            reader.join(timeout=30)

        # read output pkl
        output_pkl_filename = f"{tmp_wd}/output.pkl"
        if not os.path.exists(output_pkl_filename):
            # no output file generated by the command, show error log
            logger.error("Conda error, no output.pkl generated by conda command!!!")
            return trajectory_dict

        logger.debug("Conda finish")
        # NOTE: pickle is only safe here because the file was just produced by our own command
        with open(output_pkl_filename, "rb") as f:
            trajectory_dict = pickle.load(f)
        if settings.save_external_data:
            adata_new = sc.read_h5ad(f"{tmp_wd}/output.h5ad")  # read back adata if needed
            trajectory_dict["external_data"] = extract_external_data_dict_directly(self.adata, adata_new)
            logger.debug("save external data from adata after conda execution")
        if settings.save_h5ad:
            # save entire adata object in specific file.
            h5ad_target = f".cafe/{self.adata.uns['id']}/h5ad/{self.id}.h5ad"
            os.makedirs(os.path.dirname(h5ad_target), exist_ok=True)  # copyfile does not create directories
            shutil.copyfile(f"{tmp_wd}/output.h5ad", h5ad_target)
        if benchmark_resource:
            # read usage string and transfer to dict
            usage_string = "".join(stderr_lines)
            logger.debug(f"resource usage string: {usage_string}")
            usage_dict = parse_bash_resource_usage_string(usage_string)
            logger.debug(f"resource usage dict: {usage_dict}")
            trajectory_dict["resource_usage"] = usage_dict
        return trajectory_dict

    def run(
        self,
        fadata: FateAnnData,
        parameters: dict,
    ):
        """Run the method on ``fadata`` and attach the resulting trajectory to it.

        The data is converted with ``fadata.to_anndata(delete_trajectory=True)``, written
        to a temporary directory by ``preprocess`` and processed by ``execute``. The
        result is added through ``fadata.add_trajectory_by_type`` and, when present,
        its ``"resource_usage"`` entry through ``fadata.add_resource_usage``.

        Args:
            fadata: data object the trajectory is computed for and added to.
            parameters: method parameters; they are passed through
                ``_check_benchmark_resource`` and ``_get_parameters`` first.

        Raises:
            RuntimeError: if ``execute`` returned no trajectory output.
        """
        # check if benchmark resource from parameters.
        benchmark_resource = self._check_benchmark_resource(parameters)

        # prepare data and parameters
        adata = fadata.to_anndata(delete_trajectory=True)  # avoid other trajectory IO
        adata.uns["id"] = fadata.id
        parameters = self._get_parameters(fadata, parameters)

        # execute method, save input and output file in tmp dir
        with tempfile.TemporaryDirectory() as tmp_wd:
            logger.debug(f"Temp wd: {tmp_wd}")
            self.preprocess(adata, parameters, tmp_wd)

            trajectory_dict = self.execute(tmp_wd, benchmark_resource=benchmark_resource)

            if not trajectory_dict:
                # execute yields nothing when the command wrote no output.pkl; fail with a clear
                # message instead of an opaque TypeError further down
                raise RuntimeError(
                    f"Method '{self.function_name}' produced no trajectory output in conda env '{self.conda_name}'."
                )

            fadata.add_trajectory_by_type(trajectory_dict)  # wrapper type sorted in trajectory dict help "add_trajectory_xxx" choice.

            # add resource usage if benchmark_resource is True
            if "resource_usage" in trajectory_dict:
                fadata.add_resource_usage(trajectory_dict["resource_usage"])

    # TODO: consider if __call__ is needed
    # def __call__(self, adata: AnnData, rewrite: bool = True, **parameters):
    #     """simplified version for self.run"""
    #     # check if benchmark resource from parameters.
    #     benchmark_resource = False
    #     if "benchmark_resource" in parameters:
    #         benchmark_resource = parameters["benchmark_resource"]
    #         del parameters["benchmark_resource"]

    #     with tempfile.TemporaryDirectory() as tmp_wd:
    #         logger.debug(f"Temp wd: {tmp_wd}")
    #         self.preprocess(adata, {}, parameters, tmp_wd)
    #         trajectory_dict = self.execute(tmp_wd, benchmark_resource=benchmark_resource)

    #         return trajectory_dict

    def __str__(self):
        """Return a one-line description naming the function and its conda environment."""
        return "CondaBackend: {} in conda env '{}'".format(self.function_name, self.conda_name)

    def test_conda_env(self):
        """Report whether ``self.conda_name`` is listed by ``conda env list``.

        Returns:
            bool: True when the first whitespace-separated field of a non-comment
            output line equals the environment name. False otherwise, including
            when the listing command fails, times out or conda cannot be found.
        """
        handled_errors = (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired)
        try:
            listing = subprocess.run(["conda", "env", "list"], capture_output=True, text=True, check=True, timeout=10)
        except handled_errors as e:
            # FileNotFoundError: 'conda' command not found
            # CalledProcessError: 'conda env list' exited with a non-zero code
            logger.warning(f"Could not check for conda environments (is conda installed and in PATH?): {e}")
            return False

        # one environment per line; '#' lines are comments, the name is the first field
        rows = (row.split() for row in listing.stdout.splitlines() if not row.startswith("#"))
        if any(fields[0] == self.conda_name for fields in rows if fields):
            return True

        logger.warning(f"Conda environment '{self.conda_name}' not found.")
        return False

    def install_conda_env(self, max_waiting_time=600):
        """Create the conda environment from its corresponding yaml file.

        The file is expected at ``environment/<conda_name>.yaml`` next to this module.
        The output of ``conda env create`` is forwarded to the logger while it runs.

        Args:
            max_waiting_time: NOTE(review): accepted but not enforced; the call waits
                until ``conda env create`` exits. Confirm whether a hard limit is
                wanted before wiring it in, since environment creation can be slow.

        Raises:
            FileNotFoundError: if the yaml file does not exist.
            RuntimeError: if conda cannot be started or exits with a non-zero code.
        """
        env_file_path = os.path.join(os.path.dirname(__file__), "environment", f"{self.conda_name}.yaml")

        if not os.path.exists(env_file_path):
            logger.error(f"Conda environment yaml file not found at: {env_file_path}")
            raise FileNotFoundError(f"Conda environment yaml file not found for '{self.conda_name}'")

        logger.info(f"Creating conda environment '{self.conda_name}' from file: {env_file_path}")
        cmd = f"conda env create -f {env_file_path}"  # shown to the user in the error message
        error_message = f"""
                Failed to create conda environment '{self.conda_name}'. Check logs for details.
                Try to manually create it using following command:
                '{cmd}'
            """

        # argument list (no shell), so a yaml path containing spaces is passed verbatim
        try:
            process = subprocess.Popen(
                ["conda", "env", "create", "-f", env_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
            )
        except OSError as err:
            logger.error(f"failed to create conda environment '{self.conda_name}'.")
            raise RuntimeError(error_message) from err

        # Using threading to capture output in real-time
        stdout_thread = threading.Thread(target=print_output(logger.debug), args=(process.stdout, "[conda-create-stdout]"), daemon=True)
        stderr_thread = threading.Thread(target=print_output(logger.warning), args=(process.stderr, "[conda-create-stderr]"), daemon=True)
        stdout_thread.start()
        stderr_thread.start()

        process.wait()  # Wait for the subprocess to finish
        # let the readers flush the remaining output before the final status is logged; the timeout
        # only guards against a stray child keeping a pipe open
        stdout_thread.join(timeout=30)
        stderr_thread.join(timeout=30)

        if process.returncode != 0:
            logger.error(f"failed to create conda environment '{self.conda_name}'.")
            # The stderr is already logged by the thread
            raise RuntimeError(error_message)
        logger.info(f"successfully created conda environment '{self.conda_name}'.")

    def export_conda_env(self, export_dir: str = None, rewrite=False, format="yaml") -> None:
        """Export the conda environment to a yaml file or a packed archive.

        Nothing is exported for the ``cafe`` environment.

        Args:
            export_dir: target directory; ``None`` selects the ``environment``
                directory next to this module.
            rewrite: overwrite an already existing export file instead of skipping.
            format: ``"yml"``/``"yaml"`` writes ``<conda_name>.yaml`` from
                ``conda env export --no-builds``; any other value writes
                ``<conda_name>.tar.gz`` with ``conda pack``.
        """
        # export conda environment to yml file  in 'environment'
        if self.conda_name == "cafe":
            logger.info("cafe conda env, don'n need to generate external conda environment file.")
            return

        default_dir = f"{os.path.dirname(__file__)}/environment"
        if export_dir is None:
            export_dir = default_dir

        as_yaml = format in ("yml", "yaml")
        if as_yaml:
            # yaml file
            export_dir = export_dir or default_dir  # yaml default dir in python packages
            export_filename = f"{export_dir}/{self.conda_name}.yaml"
            cmd = ["conda", "env", "export", "-n", str(self.conda_name), "--no-builds"]
        else:
            # TODO: tar.gz file
            export_dir = export_dir or "."  # tar.gz file must be saved in specified dir
            export_filename = f"{export_dir}/{self.conda_name}.tar.gz"
            cmd = ["conda", "pack", "-n", str(self.conda_name), "-o", export_filename]

        if os.path.exists(export_filename):
            if not rewrite:
                logger.info(f"export conda environment file '{export_filename}' already exists, skip export.")
                return
            logger.info(f"export conda environment file '{export_filename}' already exists, rewrite it.")
            if not as_yaml:
                cmd.append("--force")  # conda pack refuses to overwrite an existing archive otherwise

        # run() drains both pipes while waiting, so a chatty command cannot block on a full pipe
        try:
            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        except OSError as err:
            logger.error(f"failed to export conda environment '{self.conda_name}'.")
            logger.error(f"could not run '{' '.join(cmd)}': {err}")
            return

        if result.returncode != 0:
            # nothing is written on failure, so an existing export file stays intact
            logger.error(f"failed to export conda environment '{self.conda_name}'.")
            logger.error(f"'{' '.join(cmd)}' exited with code {result.returncode}: {result.stderr.strip()}")
            return

        if as_yaml:
            # the yaml is written only after conda succeeded, instead of through a shell redirect
            with open(export_filename, "w") as f:
                f.write(result.stdout)

        if os.path.exists(export_filename):
            logger.info(f"successfully export conda environment '{self.conda_name}' to '{export_filename}'")
        else:
            logger.error(f"failed to export conda environment '{self.conda_name}'.")
        # TODO: only need save key packages manually, delete build info and other unimportant info

    def generate_dockerfile(self, conda_env_dir: str = None, pip_requirement_dir: str = None, docker_env_dir: str = None) -> None:
        """Generate a pip requirement file and a dockerfile from the conda environment yaml.

        The python version and the pip packages are read from the yaml file. The pip
        packages are written to ``<pip_requirement_dir>/<function_name>.txt`` and
        ``<docker_env_dir>/template.dockerfile`` is filled in and saved as
        ``<docker_env_dir>/<function_name>.dockerfile``. Nothing is generated for the
        ``cafe`` environment.

        Args:
            conda_env_dir: directory holding the environment yaml; defaults to
                ``environment`` next to this module.
            pip_requirement_dir: output directory of the requirement file; defaults to
                ``requirement`` next to this module.
            docker_env_dir: directory of the template and the generated dockerfile;
                defaults to ``Dockerfile`` next to this module.
        """
        if self.conda_name == "cafe":
            logger.info("cafe conda env, can't generate dockerfile automatically, need to add it manually.")
            return

        #  read from conda yaml file and generate dockerfile automatically
        module_dir = os.path.dirname(__file__)
        if conda_env_dir is None:
            conda_env_dir = os.path.join(module_dir, "environment")
        if pip_requirement_dir is None:
            pip_requirement_dir = os.path.join(module_dir, "requirement")
        if docker_env_dir is None:
            docker_env_dir = os.path.join(module_dir, "Dockerfile")
        conda_env_filename = f"{conda_env_dir}/{self.function_name}.yaml"
        if not os.path.exists(conda_env_filename):
            # install_conda_env / export_conda_env name the yaml after the conda environment;
            # fall back to that name when no per-function file exists
            conda_named_filename = f"{conda_env_dir}/{self.conda_name}.yaml"
            if os.path.exists(conda_named_filename):
                conda_env_filename = conda_named_filename
        pip_requirement_filename = f"{pip_requirement_dir}/{self.function_name}.txt"
        docker_env_filename = f"{docker_env_dir}/{self.function_name}.dockerfile"

        with open(conda_env_filename, "r") as env_file:
            conda_env_dict = yaml.safe_load(env_file)
        # an empty yaml loads as None and the "dependencies" key may be absent
        dependencies = (conda_env_dict or {}).get("dependencies") or []

        # extract python version and pip packages
        python_version = "3.10.15"  # Default version
        git_needed = False
        gpu_needed = False
        for dep in dependencies:
            if isinstance(dep, str) and dep.startswith("python="):
                # Extract the version field, e.g. '3.10.18' from 'python=3.10.18' or 'python==3.10.18'
                detected_version = dep[len("python="):].lstrip("=").split("=")[0]
                if detected_version:
                    python_version = detected_version
                    logger.debug(f"detected python version: {python_version}")
            elif isinstance(dep, dict) and "pip" in dep:
                pip_deps = dep["pip"] or []
                os.makedirs(pip_requirement_dir, exist_ok=True)  # open() does not create directories
                with open(pip_requirement_filename, "w") as requirement_file:
                    requirement_file.writelines([i + "\n" for i in pip_deps])
                    logger.debug(f"write pip requirements to '{pip_requirement_filename}'")
                if any(i.startswith("git+") for i in pip_deps):
                    git_needed = True
                if any("torch" in i or "tensorflow" in i for i in pip_deps):
                    gpu_needed = True

        # generate dockerfile from template
        with open(f"{docker_env_dir}/template.dockerfile", "r") as template_file:
            dockerfile_template = template_file.read()
        dockerfile_template = dockerfile_template.replace("$python_version", python_version)
        dockerfile_template = dockerfile_template.replace("$method_name", self.function_name)
        if git_needed:
            # pip packages installed from git need git inside the image
            dockerfile_template = dockerfile_template.replace("# git installation if needed", "RUN apt-get update && apt-get install -y git")
        with open(docker_env_filename, "w") as docker_file:
            docker_file.write(dockerfile_template)
            logger.info(f"write dockerfile base template.dockerfile to '{docker_env_filename}'")

        #  if torch or tensorflow in packages, use choosing corresponding cuda base image carefully
        if gpu_needed:
            logger.warning("GPU packages detected, please ensure to choose appropriate CUDA base image in the generated Dockerfile.")

    def build_docker_image(self) -> None:
        """Placeholder: building a docker image is not implemented yet."""
        # TODO: build docker image from generated dockerfile
        return None

`execute(tmp_wd, benchmark_resource)`

Run the method with `conda run`, which saves output.pkl in the tmp_wd dir, and return the trajectory dict

Parameters:

Name	Type	Description	Default
`tmp_wd`	`str`	tmp working dir for docker mount and saving h5ad.h5, json file	required

Returns:

Name	Type	Description
`dict`	`dict`	trajectory dict

Source code in cafe/method/fate_conda_backend.py

def execute(self, tmp_wd: str, benchmark_resource: bool = False) -> dict:
    """Run the method with ``conda run`` inside ``tmp_wd`` and return its trajectory dict.

    Args:
        tmp_wd (str): tmp working dir; adata.h5ad and parameters.json are read from it and
            output.pkl (plus output.h5ad when enabled in settings) is written into it.
        benchmark_resource (bool): when true, the command is wrapped in ``/usr/bin/time -v`` and
            the captured stderr is parsed into ``trajectory_dict["resource_usage"]``.

    Returns:
        dict: trajectory dict loaded from output.pkl; an empty dict when the command could not
        be started or produced no output.pkl.
    """
    import shutil

    trajectory_dict = {}

    parse_args_script = f"{os.path.dirname(__file__)}/function/parse_args.py"

    # construct command as an argument list (no shell), so paths containing spaces or shell
    # metacharacters reach the program verbatim
    cmd = [
        "conda",
        "run",
        "-n",
        str(self.conda_name),
        "--no-capture-output",  # use conda environment to run
        "python",
        parse_args_script,
        f"--function_name={self.function_name}",
        f"--adata_path={tmp_wd}/adata.h5ad",
        f"--parameters={tmp_wd}/parameters.json",
        f"--output_filename={tmp_wd}/output.pkl",
    ]
    if settings.save_external_data or settings.save_h5ad:
        cmd.append(f"--save_h5ad={tmp_wd}/output.h5ad")
    if benchmark_resource:
        cmd = ["/usr/bin/time", "-v", *cmd]
    logger.debug(f"cmd: {' '.join(cmd)} (cwd: {tmp_wd})")

    # Set environment variable for matplotlib to use a non-GUI backend
    env = os.environ.copy()
    env["MPLBACKEND"] = "Agg"

    # execute command with tmp_wd as working dir, so intermediate files land in the tmp dir
    try:
        process = subprocess.Popen(cmd, cwd=tmp_wd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env)
    except OSError as err:
        # e.g. conda or /usr/bin/time missing: same outcome as a run without output
        logger.error(f"Conda error, could not start command: {err}")
        return trajectory_dict

    stderr_lines = []  # to capture stderr for later resource usage parsing
    readers = [
        threading.Thread(target=print_output(logger.debug), args=(process.stdout, "[conda-excute-stdout]"), daemon=True),
        threading.Thread(target=print_output(logger.debug, stderr_lines), args=(process.stderr, "[conda-excute-stderr]"), daemon=True),
    ]
    for reader in readers:
        reader.start()
    # wait for process
    process.wait()
    # Let the readers drain the pipes before stderr_lines is used: the /usr/bin/time report is
    # written last. The timeout only guards against a stray child keeping a pipe open.
    for reader in readers:
        reader.join(timeout=30)

    # read output pkl
    output_pkl_filename = f"{tmp_wd}/output.pkl"
    if not os.path.exists(output_pkl_filename):
        # no output file generated by the command, show error log
        logger.error("Conda error, no output.pkl generated by conda command!!!")
        return trajectory_dict

    logger.debug("Conda finish")
    # NOTE: pickle is only safe here because the file was just produced by our own command
    with open(output_pkl_filename, "rb") as f:
        trajectory_dict = pickle.load(f)
    if settings.save_external_data:
        adata_new = sc.read_h5ad(f"{tmp_wd}/output.h5ad")  # read back adata if needed
        trajectory_dict["external_data"] = extract_external_data_dict_directly(self.adata, adata_new)
        logger.debug("save external data from adata after conda execution")
    if settings.save_h5ad:
        # save entire adata object in specific file.
        h5ad_target = f".cafe/{self.adata.uns['id']}/h5ad/{self.id}.h5ad"
        os.makedirs(os.path.dirname(h5ad_target), exist_ok=True)  # copyfile does not create directories
        shutil.copyfile(f"{tmp_wd}/output.h5ad", h5ad_target)
    if benchmark_resource:
        # read usage string and transfer to dict
        usage_string = "".join(stderr_lines)
        logger.debug(f"resource usage string: {usage_string}")
        usage_dict = parse_bash_resource_usage_string(usage_string)
        logger.debug(f"resource usage dict: {usage_dict}")
        trajectory_dict["resource_usage"] = usage_dict
    return trajectory_dict

`install_conda_env(max_waiting_time=600)`

Create the environment from the corresponding conda environment yaml file

Source code in cafe/method/fate_conda_backend.py

def install_conda_env(self, max_waiting_time=600):
    """Create the conda environment from its corresponding yaml file.

    The file is expected at ``environment/<conda_name>.yaml`` next to this module.
    The output of ``conda env create`` is forwarded to the logger while it runs.

    Args:
        max_waiting_time: NOTE(review): accepted but not enforced; the call waits
            until ``conda env create`` exits. Confirm whether a hard limit is
            wanted before wiring it in, since environment creation can be slow.

    Raises:
        FileNotFoundError: if the yaml file does not exist.
        RuntimeError: if conda cannot be started or exits with a non-zero code.
    """
    env_file_path = os.path.join(os.path.dirname(__file__), "environment", f"{self.conda_name}.yaml")

    if not os.path.exists(env_file_path):
        logger.error(f"Conda environment yaml file not found at: {env_file_path}")
        raise FileNotFoundError(f"Conda environment yaml file not found for '{self.conda_name}'")

    logger.info(f"Creating conda environment '{self.conda_name}' from file: {env_file_path}")
    cmd = f"conda env create -f {env_file_path}"  # shown to the user in the error message
    error_message = f"""
            Failed to create conda environment '{self.conda_name}'. Check logs for details.
            Try to manually create it using following command:
            '{cmd}'
        """

    # argument list (no shell), so a yaml path containing spaces is passed verbatim
    try:
        process = subprocess.Popen(
            ["conda", "env", "create", "-f", env_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
    except OSError as err:
        logger.error(f"failed to create conda environment '{self.conda_name}'.")
        raise RuntimeError(error_message) from err

    # Using threading to capture output in real-time
    stdout_thread = threading.Thread(target=print_output(logger.debug), args=(process.stdout, "[conda-create-stdout]"), daemon=True)
    stderr_thread = threading.Thread(target=print_output(logger.warning), args=(process.stderr, "[conda-create-stderr]"), daemon=True)
    stdout_thread.start()
    stderr_thread.start()

    process.wait()  # Wait for the subprocess to finish
    # let the readers flush the remaining output before the final status is logged; the timeout
    # only guards against a stray child keeping a pipe open
    stdout_thread.join(timeout=30)
    stderr_thread.join(timeout=30)

    if process.returncode != 0:
        logger.error(f"failed to create conda environment '{self.conda_name}'.")
        # The stderr is already logged by the thread
        raise RuntimeError(error_message)
    logger.info(f"successfully created conda environment '{self.conda_name}'.")

`preprocess(adata, parameters, tmp_wd)`

save adata h5ad, prior information and parameters json file in tmp_wd dir

Source code in cafe/method/fate_conda_backend.py

def preprocess(self, adata: AnnData, parameters: dict, tmp_wd: str) -> None:
    """Write the inputs of the method into ``tmp_wd``.

    ``adata`` goes to ``<tmp_wd>/adata.h5ad`` (that path is first recorded in
    ``adata.uns["filename"]``) and ``parameters`` is dumped as JSON to
    ``<tmp_wd>/parameters.json``.
    """
    h5ad_path = f"{tmp_wd}/adata.h5ad"
    # the path is stored inside the object before writing, so it ends up in the file too
    adata.uns["filename"] = h5ad_path
    adata.write(filename=h5ad_path)

    keep_input = settings.save_external_data or settings.save_h5ad
    if keep_input:
        # kept on the instance for the later comparison with the output
        self.adata = adata

    params_path = f"{tmp_wd}/parameters.json"
    with open(params_path, "w") as params_file:
        json.dump(parameters, params_file)

`run(fadata, parameters)`

run

Source code in cafe/method/fate_conda_backend.py

def run(
    self,
    fadata: FateAnnData,
    parameters: dict,
):
    """Run the method on ``fadata`` and attach the resulting trajectory to it.

    The data is converted with ``fadata.to_anndata(delete_trajectory=True)``, written
    to a temporary directory by ``preprocess`` and processed by ``execute``. The
    result is added through ``fadata.add_trajectory_by_type`` and, when present,
    its ``"resource_usage"`` entry through ``fadata.add_resource_usage``.

    Args:
        fadata: data object the trajectory is computed for and added to.
        parameters: method parameters; they are passed through
            ``_check_benchmark_resource`` and ``_get_parameters`` first.

    Raises:
        RuntimeError: if ``execute`` returned no trajectory output.
    """
    # check if benchmark resource from parameters.
    benchmark_resource = self._check_benchmark_resource(parameters)

    # prepare data and parameters
    adata = fadata.to_anndata(delete_trajectory=True)  # avoid other trajectory IO
    adata.uns["id"] = fadata.id
    parameters = self._get_parameters(fadata, parameters)

    # execute method, save input and output file in tmp dir
    with tempfile.TemporaryDirectory() as tmp_wd:
        logger.debug(f"Temp wd: {tmp_wd}")
        self.preprocess(adata, parameters, tmp_wd)

        trajectory_dict = self.execute(tmp_wd, benchmark_resource=benchmark_resource)

        if not trajectory_dict:
            # execute yields nothing when the command wrote no output.pkl; fail with a clear
            # message instead of an opaque TypeError further down
            raise RuntimeError(
                f"Method '{self.function_name}' produced no trajectory output in conda env '{self.conda_name}'."
            )

        fadata.add_trajectory_by_type(trajectory_dict)  # wrapper type sorted in trajectory dict help "add_trajectory_xxx" choice.

        # add resource usage if benchmark_resource is True
        if "resource_usage" in trajectory_dict:
            fadata.add_resource_usage(trajectory_dict["resource_usage"])

`test_conda_env()`

test if conda environment is available

Source code in cafe/method/fate_conda_backend.py

def test_conda_env(self):
    """Report whether ``self.conda_name`` is listed by ``conda env list``.

    Returns:
        bool: True when the first whitespace-separated field of a non-comment
        output line equals the environment name. False otherwise, including
        when the listing command fails, times out or conda cannot be found.
    """
    handled_errors = (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired)
    try:
        listing = subprocess.run(["conda", "env", "list"], capture_output=True, text=True, check=True, timeout=10)
    except handled_errors as e:
        # FileNotFoundError: 'conda' command not found
        # CalledProcessError: 'conda env list' exited with a non-zero code
        logger.warning(f"Could not check for conda environments (is conda installed and in PATH?): {e}")
        return False

    # one environment per line; '#' lines are comments, the name is the first field
    rows = (row.split() for row in listing.stdout.splitlines() if not row.startswith("#"))
    if any(fields[0] == self.conda_name for fields in rows if fields):
        return True

    logger.warning(f"Conda environment '{self.conda_name}' not found.")
    return False