
Source code for runhouse.resources.tables.table

import copy
import logging
from pathlib import Path
from typing import Dict, List, Optional

import fsspec
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import ray
import ray.data

from runhouse import Folder
from runhouse.globals import rns_client

from runhouse.resources.folders import folder
from runhouse.resources.hardware.utils import _current_cluster, _get_cluster_from
from runhouse.resources.resource import Resource

PREFETCH_KWARG = "prefetch_batches" if ray.__version__ >= "2.4.0" else "prefetch_blocks"

logger = logging.getLogger(__name__)


class Table(Resource):
    RESOURCE_TYPE = "table"
    DEFAULT_FOLDER_PATH = "/runhouse-table"
    DEFAULT_CACHE_FOLDER = ".cache/runhouse/tables"
    DEFAULT_STREAM_FORMAT = "pyarrow"
    DEFAULT_BATCH_SIZE = 256
    DEFAULT_PREFETCH_BATCHES = 1
    def __init__(
        self,
        path: str,
        name: Optional[str] = None,
        file_name: Optional[str] = None,
        system: Optional[str] = None,
        data_config: Optional[dict] = None,
        dryrun: bool = False,
        partition_cols: Optional[List] = None,
        stream_format: Optional[str] = None,
        metadata: Optional[Dict] = None,
        **kwargs,
    ):
        """
        The Runhouse Table object.

        .. note::
            To build a Table, please use the factory method :func:`table`.
        """
        super().__init__(name=name, dryrun=dryrun)
        self.file_name = file_name

        # Use factory method so the correct subclass for the system is returned
        # Strip the file name from the path if one is provided
        self._folder = folder(
            path=str(Path(path).parents[0]) if Path(path).suffix else path,
            system=system,
            data_config=data_config,
            dryrun=dryrun,
        )

        self._cached_data = None
        self.partition_cols = partition_cols
        self.stream_format = stream_format or self.DEFAULT_STREAM_FORMAT
        self.metadata = metadata or {}
    @staticmethod
    def from_config(config: dict, dryrun=False):
        if isinstance(config["system"], dict):
            config["system"] = _get_cluster_from(config["system"], dryrun=dryrun)
        return _load_table_subclass(config, dryrun=dryrun)

    @property
    def config_for_rns(self):
        config = super().config_for_rns
        if isinstance(self._folder, Resource):
            config["system"] = self._resource_string_for_subconfig(self.system)
            config["data_config"] = self._folder._data_config
        else:
            config["system"] = self.system

        self.save_attrs_to_config(
            config, ["path", "partition_cols", "metadata", "file_name"]
        )
        return config

    @property
    def data(self) -> ray.data.Dataset:
        """Get the table data. If data is not already cached, return a Ray dataset.

        With the dataset object we can stream or convert to other types, for example:

        .. code-block:: python

            data.iter_batches()
            data.to_pandas()
            data.to_dask()
        """
        if self._cached_data is not None:
            return self._cached_data
        return self._read_ray_dataset()

    @data.setter
    def data(self, new_data):
        """Update the data blob to new data"""
        self._cached_data = new_data
        # TODO should we save here?
        # self.save(overwrite=True)

    @property
    def system(self):
        return self._folder.system

    @system.setter
    def system(self, new_system):
        self._folder.system = new_system

    @property
    def path(self):
        if self.file_name:
            return f"{self._folder.path}/{self.file_name}"
        return self._folder.path

    @path.setter
    def path(self, new_path):
        self._folder.path = new_path

    def set_metadata(self, key, val):
        self.metadata[key] = val

    def get_metadata(self, key):
        return self.metadata.get(key)

    @property
    def fsspec_url(self):
        if self.file_name:
            return f"{self._folder.fsspec_url}/{self.file_name}"
        return self._folder.fsspec_url

    @property
    def data_config(self):
        return self._folder.data_config

    @data_config.setter
    def data_config(self, new_data_config):
        self._folder.data_config = new_data_config

    # @classmethod
    # def from_name(cls, name, dryrun=False):
    #     """Load existing Table via its name."""
    #     config = rns_client.load_config(name=name)
    #     if not config:
    #         raise ValueError(f"Table {name} not found.")
    #
    #     # We don't need to load the cluster dict here (if system is a cluster) because the table init
    #     # goes through the Folder factory method, which handles that.
    #
    #     # Add this table's name to the resource artifact registry if part of a run
    #     rns_client.add_upstream_resource(name)
    #
    #     # Uses the table subclass associated with the `resource_subtype`
    #     table_cls = _load_table_subclass(config=config, dryrun=dryrun)
    #     return table_cls.from_config(config=config, dryrun=dryrun)
    def to(self, system, path=None, data_config=None):
        """Copy and return the table on the given filesystem and path.

        Example:
            >>> local_table = rh.table(data, path="local/path")
            >>> s3_table = local_table.to("s3")
            >>> cluster_table = local_table.to(my_cluster)
        """
        new_table = copy.copy(self)
        new_table._folder = self._folder.to(
            system=system, path=path, data_config=data_config
        )
        return new_table
    def _save_sub_resources(self):
        if isinstance(self.system, Resource):
            self.system.save()
    def write(self):
        """Write underlying table data to fsspec URL.

        Example:
            >>> rh.table(data, path="path/to/write").write()
        """
        if self._cached_data is not None:
            data_to_write = self.data

            if isinstance(data_to_write, pd.DataFrame):
                data_to_write = self._ray_dataset_from_pandas(data_to_write)

            if isinstance(data_to_write, pa.Table):
                data_to_write = self._ray_dataset_from_arrow(data_to_write)

            if not isinstance(data_to_write, ray.data.Dataset):
                raise TypeError(f"Invalid Table format {type(data_to_write)}")

            self._write_ray_dataset(data_to_write)
            logger.info(f"Saved {str(self)} to: {self.fsspec_url}")

        return self
    def fetch(self, columns: Optional[list] = None) -> pa.Table:
        """Returns the complete table contents.

        Example:
            >>> table = rh.table(data)
            >>> formatted_data = table.fetch()
        """
        # https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
        self._cached_data = self.read_table_from_file(columns)
        if self._cached_data is not None:
            return self._cached_data

        # Reading as a file-like object can raise IsADirectoryError if the folder path is actually a
        # directory and the file was automatically generated inside the folder
        # (e.g. with a pyarrow table, or with partitioned data that saves multiple files within the directory)
        try:
            table_data = pq.read_table(
                self.path, columns=columns, filesystem=self._folder.fsspec_fs
            )

            if not table_data:
                raise ValueError(f"No table data found in path: {self.path}")

            self._cached_data = table_data
            return self._cached_data

        except Exception as e:
            raise Exception(f"Failed to read table in path: {self.path}") from e
    def __getstate__(self):
        """Override the pickle method to clear _cached_data before pickling."""
        state = self.__dict__.copy()
        state["_cached_data"] = None
        return state

    def __iter__(self):
        for block in self.stream(batch_size=self.DEFAULT_BATCH_SIZE):
            for sample in block:
                yield sample

    def __len__(self):
        if isinstance(self.data, pd.DataFrame):
            len_dataset = self.data.shape[0]
        elif isinstance(self.data, ray.data.Dataset):
            len_dataset = self.data.count()
        else:
            if not hasattr(self.data, "__len__") or not self.data:
                raise RuntimeError("Cannot get len for dataset.")
            else:
                len_dataset = len(self.data)
        return len_dataset

    def __str__(self):
        return self.__class__.__name__
    def stream(
        self,
        batch_size: int,
        drop_last: bool = False,
        shuffle_seed: Optional[int] = None,
        shuffle_buffer_size: Optional[int] = None,
        prefetch_batches: Optional[int] = None,
    ):
        """Return a local batched iterator over the ray dataset.

        Example:
            >>> table = rh.table(data)
            >>> batches = table.stream(batch_size=4)
            >>> for batch in batches:
            >>>     print(batch)
        """
        ray_data = self.data

        if self.stream_format == "torch":
            # https://docs.ray.io/en/master/data/api/doc/ray.data.Dataset.iter_torch_batches.html#ray.data.Dataset.iter_torch_batches
            return ray_data.iter_torch_batches(
                batch_size=batch_size,
                drop_last=drop_last,
                local_shuffle_buffer_size=shuffle_buffer_size,
                local_shuffle_seed=shuffle_seed,
                # Handle the rename of the prefetch_batches argument in ray 2.4.0
                **{PREFETCH_KWARG: prefetch_batches or self.DEFAULT_PREFETCH_BATCHES},
            )

        elif self.stream_format == "tf":
            # https://docs.ray.io/en/master/data/api/doc/ray.data.Dataset.iter_tf_batches.html
            return ray_data.iter_tf_batches(
                batch_size=batch_size,
                drop_last=drop_last,
                local_shuffle_buffer_size=shuffle_buffer_size,
                local_shuffle_seed=shuffle_seed,
                # Handle the rename of the prefetch_batches argument in ray 2.4.0
                **{PREFETCH_KWARG: prefetch_batches or self.DEFAULT_PREFETCH_BATCHES},
            )

        else:
            # https://docs.ray.io/en/latest/data/api/dataset.html#ray.data.Dataset.iter_batches
            return ray_data.iter_batches(
                batch_size=batch_size,
                batch_format=self.stream_format,
                drop_last=drop_last,
                local_shuffle_buffer_size=shuffle_buffer_size,
                local_shuffle_seed=shuffle_seed,
                # Handle the rename of the prefetch_batches argument in ray 2.4.0
                **{PREFETCH_KWARG: prefetch_batches or self.DEFAULT_PREFETCH_BATCHES},
            )
    def _read_ray_dataset(self, columns: Optional[List[str]] = None):
        """Read parquet data as a ray dataset object."""
        # https://docs.ray.io/en/latest/data/api/input_output.html#parquet
        dataset = ray.data.read_parquet(
            self.fsspec_url, columns=columns, filesystem=self._folder.fsspec_fs
        )
        return dataset

    def _write_ray_dataset(self, data_to_write: ray.data.Dataset):
        """Write a ray dataset to a fsspec filesystem"""
        if self.partition_cols:
            # TODO [JL]: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_to_dataset.html
            logger.warning("Partitioning by column not currently supported.")

        # Delete existing contents or they'll just be appended to
        self.rm()

        # https://docs.ray.io/en/master/data/api/doc/ray.data.Dataset.write_parquet.html
        # data_to_write.repartition(os.cpu_count() * 2).write_parquet(
        data_to_write.write_parquet(self.fsspec_url, filesystem=self._folder.fsspec_fs)

    @staticmethod
    def _ray_dataset_from_arrow(data: pa.Table):
        """Convert an Arrow Table to a Ray Dataset"""
        return ray.data.from_arrow(data)

    @staticmethod
    def _ray_dataset_from_pandas(data: pd.DataFrame):
        """Convert a Pandas DataFrame to a Ray Dataset"""
        return ray.data.from_pandas(data)
    def read_table_from_file(self, columns: Optional[list] = None):
        """Read a table from its path.

        Example:
            >>> table = rh.table(path="path/to/table")
            >>> table_data = table.read_table_from_file()
        """
        try:
            with fsspec.open(self.fsspec_url, mode="rb", **self.data_config) as t:
                table_data = pq.read_table(t.full_name, columns=columns)
            return table_data
        except Exception:
            return None
    def rm(self, recursive: bool = True):
        """Delete table, including its partitioned files where relevant.

        Example:
            >>> table = rh.table(path="path/to/table")
            >>> table.rm()
        """
        # https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.rm
        self._folder.rm(recursive=recursive)
    def exists_in_system(self):
        """Whether the table exists in the file system.

        Example:
            >>> table.exists_in_system()
        """
        return (
            self._folder.exists_in_system()
            and len(self._folder.ls(self.fsspec_url)) >= 1
        )
def _load_table_subclass(config: dict, dryrun: bool, data=None):
    """Load the relevant Table subclass based on the config or data type provided"""
    resource_subtype = config.get("resource_subtype", Table.__name__)

    try:
        import datasets

        if resource_subtype == "HuggingFaceTable" or isinstance(data, datasets.Dataset):
            from .huggingface_table import HuggingFaceTable

            return HuggingFaceTable(**config, dryrun=dryrun)
    except ModuleNotFoundError:
        pass
    except Exception as e:
        raise e

    try:
        if resource_subtype == "PandasTable" or isinstance(data, pd.DataFrame):
            from .pandas_table import PandasTable

            return PandasTable(**config, dryrun=dryrun)
    except ModuleNotFoundError:
        pass
    except Exception as e:
        raise e

    try:
        import dask.dataframe as dd

        if resource_subtype == "DaskTable" or isinstance(data, dd.DataFrame):
            from .dask_table import DaskTable

            return DaskTable(**config, dryrun=dryrun)
    except ModuleNotFoundError:
        pass
    except Exception as e:
        raise e

    try:
        import ray

        if resource_subtype == "RayTable" or isinstance(data, ray.data.Dataset):
            from .ray_table import RayTable

            return RayTable(**config, dryrun=dryrun)
    except ModuleNotFoundError:
        pass
    except Exception as e:
        raise e

    try:
        import cudf

        if resource_subtype == "CudfTable" or isinstance(data, cudf.DataFrame):
            raise NotImplementedError("Cudf not currently supported")
    except ModuleNotFoundError:
        pass
    except Exception as e:
        raise e

    if resource_subtype == "Table" or isinstance(data, pa.Table):
        new_table = Table(**config, dryrun=dryrun)
        return new_table
    else:
        raise TypeError(
            f"Unsupported data type {type(data)} for Table construction. "
            f"For converting data to pyarrow see: "
            f"https://arrow.apache.org/docs/7.0/python/generated/pyarrow.Table.html"
        )
def table(
    data=None,
    name: Optional[str] = None,
    path: Optional[str] = None,
    system: Optional[str] = None,
    data_config: Optional[dict] = None,
    partition_cols: Optional[list] = None,
    mkdir: bool = False,
    dryrun: bool = False,
    stream_format: Optional[str] = None,
    metadata: Optional[dict] = None,
) -> Table:
    """Constructs a Table object, which can be used to interact with the table at the given path.

    Args:
        data: Data to be stored in the table.
        name (Optional[str]): Name for the table, to reuse it later on.
        path (Optional[str]): Full path to the data file.
        system (Optional[str]): File system. Currently this must be one of:
            [``file``, ``github``, ``sftp``, ``ssh``, ``s3``, ``gs``, ``azure``].
        data_config (Optional[dict]): The data config to pass to the underlying fsspec handler.
        partition_cols (Optional[list]): List of columns to partition the table by.
        mkdir (bool): Whether to create a remote folder for the table. (Default: ``False``)
        dryrun (bool): Whether to create the Table if it doesn't exist, or load a Table object as a dryrun.
            (Default: ``False``)
        stream_format (Optional[str]): Format to stream the Table as. Currently this must be one of:
            [``pyarrow``, ``torch``, ``tf``, ``pandas``]
        metadata (Optional[dict]): Metadata to store for the table.

    Returns:
        Table: The resulting Table object.

    Example:
        >>> import runhouse as rh
        >>> # Create and save (pandas) table
        >>> rh.table(
        >>>     data=data,
        >>>     name="~/my_test_pandas_table",
        >>>     path="table_tests/test_pandas_table.parquet",
        >>>     system="file",
        >>>     mkdir=True,
        >>> ).save()
        >>>
        >>> # Load table from above
        >>> reloaded_table = rh.table(name="~/my_test_pandas_table")
    """
    if name and not any(
        [
            data is not None,
            path,
            system,
            data_config,
            partition_cols,
            stream_format,
            metadata,
        ]
    ):
        # Try reloading existing table
        return Table.from_name(name, dryrun)

    system = _get_cluster_from(
        system or _current_cluster(key="config") or Folder.DEFAULT_FS, dryrun=dryrun
    )

    file_name = None
    if path:
        # Extract the file name from the path if provided
        full_path = Path(path)
        file_suffix = full_path.suffix
        if file_suffix:
            path = str(full_path.parent)
            file_name = full_path.name

    if path is None:
        # If no path is provided we need to create one based on the name of the table
        table_name_in_path = rns_client.resolve_rns_data_resource_name(name)

        if system == rns_client.DEFAULT_FS or (
            isinstance(system, Resource) and system.on_this_cluster()
        ):
            # Create a random path to store in the .cache folder of the local filesystem
            path = str(
                Path(
                    f"~/{Table.DEFAULT_CACHE_FOLDER}/{table_name_in_path}"
                ).expanduser()
            )
        else:
            # Save to the default bucket
            path = f"{Table.DEFAULT_FOLDER_PATH}/{table_name_in_path}"

    config = {
        "system": system,
        "name": name,
        "path": path,
        "file_name": file_name,
        "data_config": data_config,
        "partition_cols": partition_cols,
        "stream_format": stream_format,
        "metadata": metadata,
    }

    new_table = _load_table_subclass(config=config, dryrun=dryrun, data=data)
    if data is not None:
        new_table.data = data

    return new_table
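A minimal end-to-end sketch of the factory, write, and streaming APIs above, mirroring the docstring examples. The DataFrame, name, and path used here are illustrative assumptions, not part of the module; any pandas DataFrame and writable local path should behave similarly.

    import pandas as pd
    import runhouse as rh

    # Hypothetical example data; any pandas DataFrame works here
    df = pd.DataFrame({"id": [1, 2, 3, 4], "value": ["a", "b", "c", "d"]})

    # Build a local PandasTable, write it to disk, and save its config
    # (name and path below are placeholders)
    my_table = (
        rh.table(
            data=df,
            name="~/my_example_table",
            path="table_tests/example_table.parquet",
            system="file",
        )
        .write()
        .save()
    )

    # Stream the saved data back in batches (pyarrow format by default)
    for batch in my_table.stream(batch_size=2):
        print(batch)

Because ``stream_format`` defaults to ``pyarrow``, each batch here comes back via Ray's ``iter_batches``; passing ``stream_format="torch"`` or ``"tf"`` at construction would route through the corresponding Ray iterators instead.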