def __getdataset__(self, split):
    """
    Build, cache, and return the AST dataset for the given split.

    Options are first read from the ``core/dataset/ast`` config section and
    used as defaults; each one can then be overridden in the split-specific
    ``core/dataset/ast/{split}`` section.

    Args:
        split (str): The split to get the dataset for. It selects the
            ``core/dataset/ast/{split}`` config section and is the key under
            which the result is stored in ``self.__ASTDatasets__``. ``"dev"``
            is loaded as ``"validation"`` unless the ``split`` option
            overrides it.

    Returns:
        dataset: The dataset stored in ``self.__ASTDatasets__`` for ``split``.

    Raises:
        AssertionError: If the resolved template is not in ``self.templates``
            or no loader produced a dataset for it.
    """
    config = self.config

    # Processes are looked up by their registry key with "/" replaced by "_".
    registered_process_mapping = {
        key.replace("/", "_"): key for key in registered_process
    }

    # Section-wide defaults.
    config.set_default_section("core/dataset/ast")
    _iterable = config.getoption("iterable", False)
    _template = config.getoption("template", "csv")
    _data_name = config.getoption("data_name", None)
    _config_name = config.getoption("config_name", None)
    _data_dir = config.getoption("data_dir", None)
    _data_files = config.getoption("data_files", None)
    _names = config.getoption("names", None)
    _features = config.getoption("features", None)
    _sep = config.getoption("sep", "\t")
    _quoting = config.getoption("quoting", 3)
    _escapechar = config.getoption("escapechar", None)
    _field = config.getoption("field", None)
    _process_functions = config.getoption("preprocess_functions", None)
    _enable_ddp_partition = config.getoption("enable_ddp_partition", True)

    _HFDatasets = HFIterableDatasets if _iterable else HFDatasets
    _ASTDatasets = ASTHFIterableDatasets if _iterable else ASTHFDatasets

    # Split-specific overrides.
    config.set_default_section(f"core/dataset/ast/{split}")

    template = config.getoption("template", _template)
    # A configured data_name always forces the "hub" loader.
    if config.getoption("data_name", _data_name) is not None:
        template = "hub"

    assert template in self.templates

    new_split = "validation" if split == "dev" else split
    new_split = config.getoption("split", new_split)

    # get dataset
    dataset = None
    if template == "csv":
        data_dir = config.getoption("data_dir", _data_dir)
        data_files = config.getoption("data_files", _data_files)
        names = config.getoption("names", _names)
        sep = config.getoption("sep", _sep)
        quoting = config.getoption("quoting", _quoting)
        escapechar = config.getoption("escapechar", _escapechar)
        dataset = _HFDatasets.from_csv(
            data_dir=data_dir,
            data_files=data_files,
            names=names,
            sep=sep,
            quoting=quoting,
            escapechar=escapechar,
            split=new_split,
        )
    elif template == "json":
        data_dir = config.getoption("data_dir", _data_dir)
        data_files = config.getoption("data_files", _data_files)
        field = config.getoption("field", _field)
        dataset = _HFDatasets.from_json(
            data_dir=data_dir,
            data_files=data_files,
            field=field,
            split=new_split,
        )
    elif template == "parquet":
        data_dir = config.getoption("data_dir", _data_dir)
        data_files = config.getoption("data_files", _data_files)
        features = config.getoption("features", _features)
        if isinstance(features, str):
            # NOTE(review): eval() executes arbitrary code taken from the
            # config value; this is only safe when the config is trusted.
            features = eval(features)
        dataset = _HFDatasets.from_parquet(
            data_dir=data_dir,
            data_files=data_files,
            split=new_split,
            features=features,
        )
    elif template == "hub":
        data_name = config.getoption("data_name", _data_name)
        config_name = config.getoption("config_name", _config_name)
        data_dir = config.getoption("data_dir", _data_dir)
        data_files = config.getoption("data_files", _data_files)
        # A ".py" data_name is resolved through cached_path first.
        data_name = cached_path(data_name) if data_name.endswith(".py") else data_name
        dataset = _HFDatasets.from_hub(
            data_name=data_name,
            config_name=config_name,
            data_dir=data_dir,
            data_files=data_files,
            split=new_split,
        )

    assert dataset is not None

    # get process functions
    process_functions = config.getoption("preprocess_functions", _process_functions)
    if process_functions is None:
        process_functions = []
    else:
        process_functions = [ASTFunction(func) for func in process_functions]

    # Publish every process referenced by the functions into this module's
    # globals under its "_"-separated name (presumably so the parsed
    # functions can resolve those names when evaluated - confirm against
    # ASTFunction).
    for pfunc in process_functions:
        for name in pfunc.__ast_process__:
            globals()[name] = init_registered_process(
                registered_process_mapping[name],
                config,
            )

    enable_ddp_partition = config.getoption(
        "enable_ddp_partition", _enable_ddp_partition
    )

    # Branch on the flag that selected the class. The previous
    # isinstance(_ASTDatasets, HFIterableDatasets) check tested a class
    # object as if it were an instance, so the configured
    # enable_ddp_partition was never forwarded to the iterable dataset.
    if _iterable:
        self.__ASTDatasets__[split] = _ASTDatasets(
            dataset=dataset.dataset,
            process_functions=process_functions,
            enable_ddp_partition=enable_ddp_partition,
        )
    else:
        self.__ASTDatasets__[split] = _ASTDatasets(
            dataset=dataset.dataset,
            process_functions=process_functions,
        )

    return self.__ASTDatasets__.get(split)
Tip
The `core/dataset/ast/train` section configures the training data for ASTDatasets.
Tip
The `core/dataset/ast/dev` section configures the validation data for ASTDatasets.
Tip
The `core/dataset/ast/test` section configures the test data for ASTDatasets.