unitorch.cli.datasets.hf

ASTDatasets

Tip

core/dataset/ast is the section for configuration of ASTDatasets.

Get the dataset for the specified split.

Parameters:

Name	Type	Description	Default
`split`	`str`	The split to get the dataset for.	required

Returns:

Name	Type	Description
`dataset`		The dataset for the specified split.

Source code in src/unitorch/cli/datasets/hf.py

def __getdataset__(self, split):
    """
    Build the AST dataset for the specified split and store it on the instance.

    Options are first read from the ``core/dataset/ast`` config section and
    then looked up again in ``core/dataset/ast/{split}``, where the values
    from the first section serve as fallbacks. The raw dataset is loaded
    through the ``csv``, ``json``, ``parquet`` or ``hub`` loader, wrapped
    together with the configured preprocess functions, and saved in
    ``self.__ASTDatasets__[split]``.

    Args:
        split (str): The split to get the dataset for.

    Returns:
        dataset: The dataset for the specified split.

    Raises:
        AssertionError: If the resolved template is not in ``self.templates``,
            if the ``hub`` template is selected without a ``data_name``, or
            if no loader produced a dataset for the template.
    """
    config = self.config

    # Process names referenced inside AST expressions use "_" where the
    # registry keys use "/"; map the former back to the latter.
    registered_process_mapping = {
        key.replace("/", "_"): key for key in registered_process
    }

    # Section-wide values; each one is the fallback for the per-split lookup.
    config.set_default_section("core/dataset/ast")
    _iterable = config.getoption("iterable", False)
    _template = config.getoption("template", "csv")
    _data_name = config.getoption("data_name", None)
    _config_name = config.getoption("config_name", None)
    _data_dir = config.getoption("data_dir", None)
    _data_files = config.getoption("data_files", None)
    _names = config.getoption("names", None)
    _features = config.getoption("features", None)
    _sep = config.getoption("sep", "\t")
    # NOTE(review): 3 equals csv.QUOTE_NONE in the stdlib; presumably that is
    # how from_csv interprets it -- confirm against the loader.
    _quoting = config.getoption("quoting", 3)
    _escapechar = config.getoption("escapechar", None)
    _field = config.getoption("field", None)
    _process_functions = config.getoption("preprocess_functions", None)
    _enable_ddp_partition = config.getoption("enable_ddp_partition", True)

    # "iterable" is only read from the section-wide config, never per split.
    _HFDatasets = HFIterableDatasets if _iterable else HFDatasets
    _ASTDatasets = ASTHFIterableDatasets if _iterable else ASTHFDatasets

    config.set_default_section(f"core/dataset/ast/{split}")

    template = config.getoption("template", _template)
    # A configured data_name always selects the hub loader, whatever the
    # template option says.
    if config.getoption("data_name", _data_name) is not None:
        template = "hub"

    assert template in self.templates, f"unsupported template: {template!r}"

    # "dev" is loaded from the "validation" split unless "split" overrides it.
    new_split = "validation" if split == "dev" else split
    new_split = config.getoption("split", new_split)

    # get dataset
    dataset = None
    if template == "csv":
        dataset = _HFDatasets.from_csv(
            data_dir=config.getoption("data_dir", _data_dir),
            data_files=config.getoption("data_files", _data_files),
            names=config.getoption("names", _names),
            sep=config.getoption("sep", _sep),
            quoting=config.getoption("quoting", _quoting),
            escapechar=config.getoption("escapechar", _escapechar),
            split=new_split,
        )
    elif template == "json":
        dataset = _HFDatasets.from_json(
            data_dir=config.getoption("data_dir", _data_dir),
            data_files=config.getoption("data_files", _data_files),
            field=config.getoption("field", _field),
            split=new_split,
        )
    elif template == "parquet":
        data_dir = config.getoption("data_dir", _data_dir)
        data_files = config.getoption("data_files", _data_files)
        features = config.getoption("features", _features)
        if isinstance(features, str):
            # SECURITY: eval() executes arbitrary code taken from the config
            # value. Only use configs from trusted sources.
            features = eval(features)
        dataset = _HFDatasets.from_parquet(
            data_dir=data_dir,
            data_files=data_files,
            split=new_split,
            features=features,
        )
    elif template == "hub":
        data_name = config.getoption("data_name", _data_name)
        config_name = config.getoption("config_name", _config_name)
        data_dir = config.getoption("data_dir", _data_dir)
        data_files = config.getoption("data_files", _data_files)
        # template may be set to "hub" explicitly without any data_name.
        assert data_name is not None, "template 'hub' requires a data_name"
        # A ".py" data_name is passed through cached_path before loading.
        if data_name.endswith(".py"):
            data_name = cached_path(data_name)
        dataset = _HFDatasets.from_hub(
            data_name=data_name,
            config_name=config_name,
            data_dir=data_dir,
            data_files=data_files,
            split=new_split,
        )

    assert dataset is not None, f"no dataset loaded for template {template!r}"

    # get process functions
    process_functions = config.getoption("preprocess_functions", _process_functions)
    if process_functions is None:
        process_functions = []
    else:
        process_functions = [ASTFunction(func) for func in process_functions]

    # Instantiate every process referenced by the AST functions and publish
    # it in this module's globals under the name the expression uses.
    for pfunc in process_functions:
        for name in pfunc.__ast_process__:
            globals()[name] = init_registered_process(
                registered_process_mapping[name],
                config,
            )

    enable_ddp_partition = config.getoption(
        "enable_ddp_partition", _enable_ddp_partition
    )

    # Branch on the flag that selected the class. The previous
    # isinstance(_ASTDatasets, HFIterableDatasets) check tested a class object
    # rather than an instance, so it was always False and
    # enable_ddp_partition was never forwarded.
    if _iterable:
        self.__ASTDatasets__[split] = _ASTDatasets(
            dataset=dataset.dataset,
            process_functions=process_functions,
            enable_ddp_partition=enable_ddp_partition,
        )
    else:
        self.__ASTDatasets__[split] = _ASTDatasets(
            dataset=dataset.dataset,
            process_functions=process_functions,
        )

    return self.__ASTDatasets__.get(split)

Tip

core/dataset/ast/train is the section for configuration of ASTDatasets Training Data.

Tip

core/dataset/ast/dev is the section for configuration of ASTDatasets Validation Data.

Tip

core/dataset/ast/test is the section for configuration of ASTDatasets Test Data.