# NOTE: This listing is the v0.0.12 version of the source; a newer stable version of the docs exists.
# Source code for runhouse.resources.provenance

import copy
import json
import logging
import sys
from enum import Enum
from io import StringIO
from pathlib import Path
from typing import Any, List, Optional, Union

from runhouse.globals import configs, obj_store, rns_client
from runhouse.resources.blobs import file

# Need to alias so it doesn't conflict with the folder property
from runhouse.resources.folders import Folder, folder as folder_factory
from runhouse.resources.hardware import _current_cluster, _get_cluster_from, Cluster
from runhouse.resources.resource import Resource
from runhouse.rns.top_level_rns_fns import resolve_rns_path
from runhouse.rns.utils.api import log_timestamp, resolve_absolute_path

# Load the root logger (``logging.getLogger("")`` returns the root logger). ``Run.__enter__`` attaches a
# stream handler to this logger so log records emitted inside a Run are also captured.
logger = logging.getLogger("")


class RunStatus(str, Enum):
    """Lifecycle states recorded on a Run.

    The ``str`` mixin makes every member compare equal to its plain string value.
    """

    # Default status given to a newly constructed Run
    NOT_STARTED = "NOT_STARTED"
    # Set when the Run's context manager is entered or a new Run is registered
    RUNNING = "RUNNING"
    # Set when the Run finishes without an exception / with a zero return code
    COMPLETED = "COMPLETED"
    CANCELLED = "CANCELLED"
    # Set when the Run raised inside its context manager or a command returned a non-zero code
    ERROR = "ERROR"


class RunType(str, Enum):
    """Kinds of Run. The ``str`` mixin makes every member compare equal to its plain string value.

    Note that the member names and their string values differ for the first two members.
    """

    # Run created from a list of commands
    CMD_RUN = "CMD"
    # Run created from a function call
    FUNCTION_RUN = "FUNCTION"
    # Run created by using the Run object as a context manager
    CTX_MANAGER = "CTX_MANAGER"


[docs]class Run(Resource):
    RESOURCE_TYPE = "run"

    LOCAL_RUN_PATH = f"{rns_client.rh_directory}/runs"

    RUN_CONFIG_FILE = "config_for_run.json"
    RESULT_FILE = "result.pkl"
    INPUTS_FILE = "inputs.pkl"

[docs]    def __init__(
        self,
        name: str = None,
        fn_name: str = None,
        cmds: list = None,
        log_dest: str = "file",
        path: str = None,
        system: Union[str, Cluster] = None,
        data_config: dict = None,
        status: RunStatus = RunStatus.NOT_STARTED,
        start_time: Optional[str] = None,
        end_time: Optional[str] = None,
        creator: Optional[str] = None,
        creation_stacktrace: Optional[str] = None,
        upstream_artifacts: Optional[List] = None,
        downstream_artifacts: Optional[List] = None,
        run_type: RunType = RunType.CMD_RUN,
        error: Optional[str] = None,
        error_traceback: Optional[str] = None,
        overwrite: bool = False,
        dryrun: bool = False,
        **kwargs,
    ):
        """
        Runhouse Run object

        .. note::
            To load an existing Run, please use the factory method :func:`run`.
        """
        run_name = name or str(self._current_timestamp())
        super().__init__(name=run_name, dryrun=dryrun)

        self.log_dest = log_dest
        self.folder = None
        if self.log_dest == "file":
            folder_system = system or Folder.DEFAULT_FS
            folder_path = (
                resolve_absolute_path(path)
                if path
                else (
                    self._base_local_folder_path(self.name)
                    if folder_system == Folder.DEFAULT_FS
                    else self._base_cluster_folder_path(name=run_name)
                )
            )

            if overwrite:
                # Delete the Run from the system if one already exists
                self._delete_existing_run(folder_path, folder_system)

            # Create new folder which lives on the system and contains all the Run's data:
            # (run config, stdout, stderr, inputs, result)
            self.folder = folder_factory(
                path=folder_path,
                system=folder_system,
                data_config=data_config,
                dryrun=dryrun,
            )

        self.status = status
        self.start_time = start_time
        self.end_time = end_time
        self.creator = creator
        self.creation_stacktrace = creation_stacktrace
        self.upstream_artifacts = upstream_artifacts or []
        self.downstream_artifacts = downstream_artifacts or []
        self.fn_name = fn_name
        self.cmds = cmds
        self.run_type = run_type or self._detect_run_type()
        self.error = error
        self.traceback = error_traceback
        # TODO string representation of inputs

    def __enter__(self):
        """Start the Run: mark it as running, register it with ``rns_client`` and, when logging to file,
        tee ``sys.stdout`` / ``sys.stderr`` into the Run's ``.out`` / ``.err`` files.

        Returns:
            Run: This Run object.
        """
        self.status = RunStatus.RUNNING
        self.start_time = self._current_timestamp()

        # Begin tracking the Run in the rns_client - this adds the current Run to the stack of active Runs
        rns_client.start_run(self)

        if self.log_dest != "file":
            return self

        # Capture stdout and stderr to the Run's folder
        self.folder.mkdir()

        # TODO fix the fact that we keep appending and then stream back the full file
        stdout_file = Path(self._stdout_path).open(mode="a")
        sys.stdout = StreamTee(sys.stdout, [stdout_file])

        stderr_file = Path(self._stderr_path).open(mode="a")
        sys.stderr = StreamTee(sys.stderr, [stderr_file])

        # Add the stdout handler to the root logger so log records are also written to the tee'd stdout
        handler = logging.StreamHandler(sys.stdout)
        self._stdout_handler = handler
        logger.addHandler(handler)

        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Finish the Run when leaving the ``with`` block.

        Records the end time and the final status (``ERROR`` with the exception details if the block raised,
        ``COMPLETED`` otherwise) and pops the Run from the ``rns_client`` stack. When logging to file it also
        detaches the logging handler added in ``__enter__``, restores ``sys.stdout`` / ``sys.stderr``, closes
        the log files opened in ``__enter__`` and writes the Run config to the Run's folder.

        Returns:
            bool: Always ``False`` so that any exception raised inside the block propagates.
        """
        self.end_time = self._current_timestamp()
        if exc_type:
            self.status = RunStatus.ERROR
            self.error = exc_value
            self.traceback = exc_traceback
        else:
            self.status = RunStatus.COMPLETED

        # Pop the current Run from the stack of active Runs
        rns_client.stop_run()

        if self.log_dest == "file":
            logger.removeHandler(self._stdout_handler)

            # Restore stdout and stderr
            stdout_tee = sys.stdout
            sys.stdout = stdout_tee.instream
            stderr_tee = sys.stderr
            sys.stderr = stderr_tee.instream

            # Close the log files opened in __enter__ (closing also flushes them) so the handles do not leak.
            # The tee's outstreams are emptied afterwards so that any lingering reference to the tee simply
            # passes writes through to the original stream instead of hitting a closed file.
            for tee in (stdout_tee, stderr_tee):
                for log_file in tee.outstreams:
                    log_file.close()
                tee.outstreams = []

            # Save Run config to its folder on the system - this will already happen on the cluster
            # for function based Runs
            self._write_config()

        # return False to propagate any exception that occurred inside the with block
        return False

    @staticmethod
    def from_config(config: dict, dryrun=False):
        """Build a Run by passing the entries of ``config`` (plus ``dryrun``) as keyword arguments."""
        run_obj = Run(**config, dryrun=dryrun)
        return run_obj

    def __getstate__(self):
        """Return a copy of the instance state with the folder object and the stdout logging handler
        blanked out, for use when pickling the Run."""
        state = dict(self.__dict__)
        state.update(folder=None, _stdout_handler=None)
        return state

    @property
    def config_for_rns(self):
        """Metadata to store in RNS for the Run.

        Extends the parent resource config with the Run's status, timing, type, provenance and error fields.
        ``path`` and ``system`` are ``None`` when the Run has no folder (``log_dest`` is not ``"file"``, or
        the Run was restored from a pickle, which drops the folder).
        """
        config = super().config_for_rns

        # The folder only exists for file-logged Runs - guard against it being unset
        folder = self.folder
        has_folder = folder is not None

        base_config = {
            "status": self.status,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "run_type": self.run_type,
            "log_dest": self.log_dest,
            "creator": self.creator,
            "fn_name": self.fn_name,
            "cmds": self.cmds,
            # NOTE: artifacts are currently only tracked in context manager based runs
            "upstream_artifacts": self.upstream_artifacts,
            "downstream_artifacts": self.downstream_artifacts,
            "path": folder.path if has_folder else None,
            "system": self._resource_string_for_subconfig(folder.system)
            if has_folder
            else None,
            # Stringified so the config can be dumped to JSON
            "error": str(self.error),
            "traceback": str(self.traceback),
        }
        config.update(base_config)
        return config

    def populate_init_provenance(self):
        """Record who created the Run and the call stack at creation time.

        Sets ``self.creator`` from the ``username`` entry of the Runhouse configs and
        ``self.creation_stacktrace`` to the formatted current call stack.
        """
        # ``self.traceback`` holds the Run's error traceback (usually None), not the stdlib module,
        # so the module must be imported explicitly to format the stack.
        import traceback as traceback_module

        self.creator = configs.get("username", None)
        self.creation_stacktrace = "".join(
            traceback_module.format_stack(limit=11)[1:]
        )

    @property
    def run_config(self):
        """Config to save in the Run's dedicated folder on the system.
        Note: this is different from the config saved in RNS, which is the metadata for the Run.
        """
        # Each config key is read from the attribute of the same name.
        # NOTE: artifacts are currently only tracked in context manager based runs
        config_keys = (
            "name",
            "status",
            "start_time",
            "end_time",
            "run_type",
            "fn_name",
            "cmds",
            "upstream_artifacts",
            "downstream_artifacts",
        )
        return {key: getattr(self, key) for key in config_keys}

[docs]    def save(
        self,
        name: str = None,
        overwrite: bool = True,
    ):
        """If the Run name is being overwritten (ex: initially created with auto-generated name),
        update the Run config stored on the system before saving to RNS."""
        config_for_rns = self.config_for_rns
        config_path = self._path_to_config()
        if not config_for_rns["name"] or name:
            config_for_rns["name"] = resolve_rns_path(name or self.name)
            self._write_config(config=config_for_rns)
            logger.info(f"Updated Run config name in path: {config_path}")

        return super().save(name, overwrite)

[docs]    def write(self, data: Any, path: str):
        """Write data (ex: function inputs or result, stdout, stderr) to the Run's dedicated folder on the system."""
        file(system=self.folder.system, path=path).write(data, serialize=False)

[docs]    def to(
        self,
        system,
        path: Optional[str] = None,
        data_config: Optional[dict] = None,
    ):
        """Send a Run to another system.

        Args:
            system (Union[str or Cluster]): Name of the system or Cluster object to copy the Run to.
            path (Optional[str]): Path to the on the system to save the Run.
                Defaults to the local path for Runs (in the rh folder of the working directory).
            data_config (Optional[dict]): Config to pass into fsspec handler for copying the Run.

        Returns:
            Run: A copy of the Run on the destination system and path.
        """
        # TODO: [JL] - support for `on_completion` (wait to copy the results to destination until async run completes)

        new_run = copy.copy(self)

        if self.run_type == RunType.FUNCTION_RUN:
            results_path = self._fn_result_path()
            # Pickled function result should be saved down to the Run's folder on the cluster
            if results_path not in self.folder.ls():
                raise FileNotFoundError(
                    f"No results saved down in path: {results_path}"
                )

        for fp in [self._stdout_path, self._stderr_path]:
            # Stdout and Stderr files created on a cluster can be symlinks to the files that we create via Ray
            # by default - before copying them to a new system make sure they are regular files
            self._convert_symlink_to_file(path=fp)

        if system == "here":
            # Save to default local path if none provided
            path = path or self._base_local_folder_path(self.name)

        new_run.folder = self.folder.to(
            system=system, path=path, data_config=data_config
        )

        return new_run

[docs]    def refresh(self) -> "Run":
        """Reload the Run object from the system. This is useful for checking the status of a Run.
        For example: ``my_run.refresh().status``"""
        run_config = self._load_run_config(folder=self.folder)
        # Need the metadata from RNS and the Run specific data in order to re-load the Run object
        config = {**self.config_for_rns, **run_config}
        return Run.from_config(config, dryrun=True)

[docs]    def inputs(self) -> bytes:
        """Load the pickled function inputs saved on the system for the Run."""
        return self._load_blob_from_path(path=self._fn_inputs_path()).fetch()

[docs]    def result(self):
        """Load the function result saved on the system for the Run. If the Run has failed return the stderr,
        otherwise return the stdout."""
        run_status = self.refresh().status
        if run_status == RunStatus.COMPLETED:
            results_path = self._fn_result_path()
            if results_path not in self.folder.ls():
                raise FileNotFoundError(
                    f"No results file found in path: {results_path}"
                )
            return self._load_blob_from_path(path=results_path).fetch()
        elif run_status == RunStatus.ERROR:
            logger.info("Run failed, returning stderr")
            return self.stderr()
        else:
            logger.info(f"Run status: {self.status}, returning stdout")
            return self.stdout()

[docs]    def stdout(self) -> str:
        """Read the stdout saved on the system for the Run."""
        stdout_path = self._stdout_path
        logger.info(f"Reading stdout from path: {stdout_path}")

        return self._load_blob_from_path(path=stdout_path).fetch().decode().strip()

[docs]    def stderr(self) -> str:
        """Read the stderr saved on the system for the Run."""
        stderr_path = self._stderr_path
        logger.info(f"Reading stderr from path: {stderr_path}")

        return self._load_blob_from_path(stderr_path).fetch().decode().strip()

    def _fn_inputs_path(self) -> str:
        """Location of the function inputs file (``INPUTS_FILE``) inside the Run's folder."""
        run_folder = self.folder.path
        return f"{run_folder}/{self.INPUTS_FILE}"

    def _fn_result_path(self) -> str:
        """Location of the function result file (``RESULT_FILE``) inside the Run's folder."""
        run_folder = self.folder.path
        return f"{run_folder}/{self.RESULT_FILE}"

    def _load_blob_from_path(self, path: str):
        """Build a file object for ``path`` on the system of the Run's folder
        (ex: function inputs, result, stdout, stderr)."""
        folder_system = self.folder.system
        return file(path=path, system=folder_system)

    def _register_new_run(self):
        """Mark the Run as started (start time + ``RUNNING`` status) and persist its config to the system."""
        self.status = RunStatus.RUNNING
        self.start_time = self._current_timestamp()

        # Write config data for the Run to its config file on the system
        run_folder = self.folder.path
        logger.info(f"Registering new Run on system in path: {run_folder}")
        self._write_config()

    def _register_fn_run_completion(self, run_status: RunStatus):
        """Record the end time and final ``run_status`` of a function based Run and persist its config."""
        self.status = run_status
        self.end_time = self._current_timestamp()

        logger.info(f"Registering a completed fn Run with status: {run_status}")
        self._write_config()

    def _register_cmd_run_completion(self, return_codes: list):
        """Update a cmd based Run's config and register its stdout and stderr after running on the system.

        Args:
            return_codes (list): Results of the commands. Only the first entry is used, and it is indexed
                as ``(return code, stdout, stderr)``.
        """
        exit_code, cmd_stdout, cmd_stderr = (
            return_codes[0][0],
            return_codes[0][1],
            return_codes[0][2],
        )

        # Record when the Run finished, as is done for function based Runs
        self.end_time = self._current_timestamp()
        run_status = RunStatus.ERROR if exit_code != 0 else RunStatus.COMPLETED
        self.status = run_status

        logger.info(f"Registering a completed cmd Run with status: {run_status}")
        self._write_config()

        # Write the stdout and stderr of the commands Run to the Run's folder
        self.write(data=cmd_stdout.encode(), path=self._stdout_path)
        self.write(data=cmd_stderr.encode(), path=self._stderr_path)

    def _write_config(self, config: dict = None, overwrite: bool = True):
        """Dump a config as indented JSON into the Run's config file inside its folder.

        Args:
            config (Optional[Dict]): Config to write. If none is provided, the Run's config for RNS will be used.
            overwrite (Optional[bool]): Overwrite the config if one is already saved down. Defaults to ``True``.
        """

        def _dump_json(data, f):
            return json.dump(data, f, indent=4)

        payload = config or self.config_for_rns
        logger.info(f"Config to save on system: {payload}")

        self.folder.put(
            {self.RUN_CONFIG_FILE: payload},
            overwrite=overwrite,
            mode="w",
            write_fn=_dump_json,
        )

    def _detect_run_type(self):
        """Infer the run type: a function Run if ``fn_name`` is set, a command Run if ``cmds`` is not
        ``None``, and a context manager Run otherwise."""
        if self.fn_name:
            return RunType.FUNCTION_RUN
        if self.cmds is None:
            return RunType.CTX_MANAGER
        return RunType.CMD_RUN

    def _path_to_config(self) -> str:
        """Location of the Run's config file (``RUN_CONFIG_FILE``) inside the Run's folder."""
        run_folder = self.folder.path
        return f"{run_folder}/{self.RUN_CONFIG_FILE}"

    def _path_to_file_by_ext(self, ext: str) -> str:
        """Path to the Run's file with the provided extension (ex: ``.out`` or ``.err``).

        Prefers a file with that extension already present in the Run's folder (ex: with a function on a
        Ray cluster this will already be generated for us); otherwise falls back to ``<folder>/<name><ext>``.
        """
        found = self._find_file_path_by_ext(ext=ext)
        return found or f"{self.folder.path}/{self.name}" + ext

    def _convert_symlink_to_file(self, path: str):
        """If the system is a Cluster and the file path is a symlink, convert it to a regular file.
        This is necessary to allow for copying of the file between systems (ex: cluster --> s3 or cluster --> local)."""
        if not isinstance(self.folder.system, Cluster):
            # Nothing to do for non-cluster systems
            return

        # Ask the cluster whether the path is a symlink; the answer is read from the command's stdout
        probe_result: list = self.folder.system.run(
            [f"test -h {path} && echo True || echo False"], stream_logs=True
        )
        is_symlink = probe_result[0][1].strip() == "True"
        if not is_symlink:
            return

        # Replace the symlink with a copy of the file it points to
        self.folder.system.run([f"cp --remove-destination `readlink {path}` {path}"])

    @property
    def _stdout_path(self) -> str:
        """Location of the Run's ``.out`` (stdout) file."""
        stdout_ext = ".out"
        return self._path_to_file_by_ext(ext=stdout_ext)

    @property
    def _stderr_path(self) -> str:
        """Location of the Run's ``.err`` (stderr) file."""
        stderr_ext = ".err"
        return self._path_to_file_by_ext(ext=stderr_ext)

    def _find_file_path_by_ext(self, ext: str) -> Union[str, None]:
        """Look up an existing file with the provided extension in the Run's folder. Needed when loading the
        stdout and stderr files associated with a particular run.

        Returns:
            The first matching entry of the sorted folder listing, or ``None`` if the folder does not exist
            or holds no file with that extension.
        """
        try:
            listing: list = self.folder.ls(sort=True)
        except FileNotFoundError:
            return None

        matches = self._filter_files_by_ext(listing, ext)
        # No match means no .out / .err file was already created in the logs folder for this Run
        return matches[0] if matches else None

    def _register_upstream_artifact(self, artifact_name: str):
        """Add ``artifact_name`` to the Run's upstream artifacts (objects loaded in the Run's context
        manager) unless it is already registered."""
        if artifact_name in self.upstream_artifacts:
            return
        self.upstream_artifacts.append(artifact_name)

    def _register_downstream_artifact(self, artifact_name: str):
        """Add ``artifact_name`` to the Run's downstream artifacts (objects saved in the Run's context
        manager) unless it is already registered."""
        if artifact_name in self.downstream_artifacts:
            return
        self.downstream_artifacts.append(artifact_name)

    @staticmethod
    def _current_timestamp():
        """Return the value of ``log_timestamp()`` as a string."""
        timestamp = log_timestamp()
        return str(timestamp)

    @staticmethod
    def _filter_files_by_ext(files: list, ext: str):
        """Return the entries of ``files`` that end with ``ext``, in their original order."""
        return [file_path for file_path in files if file_path.endswith(ext)]

    @staticmethod
    def _delete_existing_run(folder_path, folder_system: str):
        """Recursively remove the folder at ``folder_path`` on ``folder_system`` so that a new Run can be
        created in its place."""
        folder_factory(path=folder_path, system=folder_system).rm(recursive=True)

    @staticmethod
    def _load_run_config(folder: Folder) -> dict:
        """Parse the Run config file (JSON) stored in ``folder``. Returns an empty dict if the file is
        not found."""
        try:
            raw_config = folder.get(Run.RUN_CONFIG_FILE)
        except FileNotFoundError:
            return {}
        else:
            return json.loads(raw_config)

    @staticmethod
    def _base_cluster_folder_path(name: str):
        """Base folder for the Run named ``name`` on a cluster: a subfolder of the object store's logs dir."""
        logs_dir = obj_store.LOGS_DIR
        return f"{logs_dir}/{name}"

    @staticmethod
    def _base_local_folder_path(name: str):
        """Base folder for the Run named ``name`` on a local system: a subfolder of the object store's
        logs dir."""
        logs_dir = obj_store.LOGS_DIR
        return f"{logs_dir}/{name}"


class StreamTee(object):
    """File-like wrapper that forwards every write to a primary stream and copies it to extra streams.

    Used to replace ``sys.stdout`` / ``sys.stderr`` while still printing to the original stream.
    Attributes that are not defined here (ex: ``isatty``, ``fileno``, ``encoding``) are looked up on the
    primary stream, so code that expects a regular text stream keeps working.

    Args:
        instream: The primary stream (ex: the original ``sys.stdout``).
        outstreams: Iterable of additional writable streams that receive a copy of each non-empty message.
    """

    def __init__(self, instream, outstreams):
        self.instream = instream
        self.outstreams = outstreams

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails. Never delegate our own attributes, otherwise
        # a lookup on a half-initialized instance (ex: during copy / unpickling) would recurse forever.
        if name in ("instream", "outstreams"):
            raise AttributeError(name)
        return getattr(self.instream, name)

    def write(self, message):
        """Write ``message`` to the primary stream and, if non-empty, to every extra stream.

        Returns:
            Whatever the primary stream's ``write`` returns (the number of characters for text streams).
        """
        written = self.instream.write(message)
        if message:
            for stream in self.outstreams:
                stream.write(message)
        return written

    def flush(self):
        """Flush the primary stream and every extra stream."""
        self.instream.flush()
        for stream in self.outstreams:
            stream.flush()


class capture_stdout:
    """Context manager for capturing stdout to a file, list, or stream, while still printing to stdout.

    Args:
        output: Where to capture stdout. A ``str`` is treated as a file path (opened for writing on
            entry and closed on exit), ``None`` captures into a new ``StringIO``, and any other object is
            used directly as the stream to write to.
    """

    def __init__(self, output=None):
        self.output = output
        self._stream = None

    def __enter__(self):
        if self.output is None:
            self.output = StringIO()

        if isinstance(self.output, str):
            self._stream = open(self.output, "w")
        else:
            self._stream = self.output
        sys.stdout = StreamTee(sys.stdout, [self])
        return self

    def _stream_is_open(self):
        """Whether a capture stream is set and has not been closed."""
        return self._stream is not None and not getattr(self._stream, "closed", False)

    def write(self, message):
        self._stream.write(message)

    def flush(self):
        # Skip closed streams (ex: the capture file after the context manager exited)
        if self._stream_is_open():
            self._stream.flush()

    @property
    def stream(self):
        """The captured stream. For a file path output this is a new read handle on the file, which the
        caller is responsible for closing."""
        if isinstance(self.output, str):
            # Make sure everything captured so far is on disk before reading it back
            self.flush()
            return open(self.output, "r")
        return self._stream

    def list(self):
        """Return the captured output as a list of lines.

        For a file path output the lines keep their line endings (``readlines``); for other outputs the
        stream's ``getvalue()`` is split with ``splitlines``.
        """
        if isinstance(self.output, str):
            with self.stream as captured_file:
                return captured_file.readlines()
        return (self.stream.getvalue() or "").splitlines()

    def __str__(self):
        if isinstance(self.output, str):
            # A real file has no ``getvalue`` - read the captured text back from disk
            with self.stream as captured_file:
                return captured_file.read()
        return self.stream.getvalue()

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout = sys.stdout.instream
        # Close the file we opened in __enter__; streams handed in by the caller are left open
        if isinstance(self.output, str) and self._stream_is_open():
            self._stream.close()
        return False


def run(
    name: str = None,
    log_dest: str = "file",
    path: str = None,
    system: Union[str, Cluster] = None,
    data_config: dict = None,
    load: bool = True,
    dryrun: bool = False,
    **kwargs,
) -> Union["Run", None]:
    """Constructs a Run object.

    Args:
        name (Optional[str]): Name of the Run to load.
        log_dest (Optional[str]): Whether to save the Run's logs to a file or stream them back. (Default: ``file``)
        path (Optional[str]): Path to the Run's dedicated folder on the system where the Run lives.
        system (Optional[str or Cluster]): File system or cluster name where the Run lives.
            If providing a file system this must be one of:
            [``file``, ``github``, ``sftp``, ``ssh``, ``s3``, ``gs``, ``azure``].
            We are working to add additional file system support.
        data_config (Optional[Dict]): The data config to pass to the underlying fsspec handler for the folder.
        load (bool): Whether to try reloading an existing Run from configs. This only happens when a
            ``name`` is given and none of ``path``, ``system``, ``data_config`` or extra kwargs are
            provided. (Default: ``True``)
        dryrun (bool): Whether to create the Run if it doesn't exist, or load a Run object as a dryrun.
            (Default: ``False``)
        **kwargs: Optional kwargs for the Run.

    Returns:
        Run: The loaded or newly constructed Run object.
    """
    if name and load and not any([path, system, data_config, kwargs]):
        # Try reloading existing Run from RNS
        return Run.from_name(name, dryrun=dryrun)

    # Resolve the system first so that the default path below is chosen for the system the Run will
    # actually live on (ex: a cluster given by name, or the cluster this code is running on).
    system = _get_cluster_from(
        system or _current_cluster(key="config") or Folder.DEFAULT_FS, dryrun=dryrun
    )

    if name and path is None and log_dest == "file":
        if isinstance(system, Cluster):
            path = Run._base_cluster_folder_path(name=name)
        else:
            path = Run._base_local_folder_path(name=name)

    return Run(
        name=name,
        log_dest=log_dest,
        path=path,
        system=system,
        data_config=data_config,
        dryrun=dryrun,
        **kwargs,
    )