Skip to content

unitorch.cli.writers¤

GeneralJsonlWriter¤

Tip

core/writer/jsonl is the section for configuration of GeneralJsonlWriter.

Bases: GenericWriter

Class for writing data in JSONL format.

Initialize GeneralJsonlWriter.

Parameters:

Name Type Description Default
output_file str

The path to the output file.

required
nrows_per_sample int

The number of rows per sample. Defaults to None.

None
header bool

Whether to include a header in the output file. Defaults to None.

None
columns List[str]

The list of columns to include in the output file. Defaults to None.

None
Source code in src/unitorch/cli/writers/__init__.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def __init__(
    self,
    output_file: str,
    nrows_per_sample: Optional[int] = None,
    header: Optional[bool] = None,
    columns: Optional[List[str]] = None,
):
    """
    Initialize GeneralJsonlWriter.

    When ``nrows_per_sample`` is given and ``output_file`` already exists,
    the number of lines already present is divided by ``nrows_per_sample``
    to obtain ``skip_n_samples`` and the file is opened for appending.
    In every other case (including an existing file holding fewer than
    ``nrows_per_sample`` lines) the file is created or truncated.

    Args:
        output_file (str): The path to the output file.
        nrows_per_sample (int, optional): The number of rows per sample. Defaults to None.
        header (bool, optional): Whether to include a header in the output file.
            Only stored on the instance here. Defaults to None.
        columns (List[str], optional): The list of columns to include in the output file. Defaults to None.
    """
    self.header = header
    self.columns = columns

    self.skip_n_samples = 0
    if nrows_per_sample is not None and os.path.exists(output_file):
        # Count the existing lines through a context manager so the read
        # handle is closed deterministically instead of being leaked, and
        # decode with the same encoding the file is written with.
        with open(output_file, encoding="utf-8") as existing:
            self.skip_n_samples = sum(1 for _ in existing) // nrows_per_sample

    # Nothing to resume from -> start a fresh file; otherwise keep what is
    # already there and append to it.
    mode = "w" if self.skip_n_samples == 0 else "a"
    self.output_file = open(output_file, mode, encoding="utf-8")

from_core_configure classmethod ¤

from_core_configure(config, **kwargs)

Create an instance of GeneralJsonlWriter from a core configuration.

Parameters:

Name Type Description Default
config

The core configuration.

required
**kwargs

Additional keyword arguments.

{}

Returns:

Name Type Description
GeneralJsonlWriter

An instance of GeneralJsonlWriter.

Source code in src/unitorch/cli/writers/__init__.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
@classmethod
@add_default_section_for_init("core/writer/jsonl")
def from_core_configure(cls, config, **kwargs):
    """
    Create an instance of GeneralJsonlWriter from a core configuration.

    The body of this method is intentionally empty and returns nothing on
    its own.
    NOTE(review): the instance is presumably built by the
    ``add_default_section_for_init`` decorator from the
    ``core/writer/jsonl`` section of ``config`` -- confirm against the
    decorator's implementation.

    Args:
        config: The core configuration.
        **kwargs: Additional keyword arguments.

    Returns:
        GeneralJsonlWriter: An instance of GeneralJsonlWriter (as seen by
        callers of the decorated method; see the note above).
    """
    pass

process_chunk ¤

process_chunk(outputs: WriterOutputs)

Process a chunk of data during the writing process.

Parameters:

Name Type Description Default
outputs WriterOutputs

The writer outputs.

required
Source code in src/unitorch/cli/writers/__init__.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def process_chunk(self, outputs: WriterOutputs):
    """
    Process a chunk of data during the writing process.

    Converts ``outputs`` to a pandas DataFrame, optionally restricts it to
    ``self.columns`` (kept in that order; names missing from the DataFrame
    are ignored), serializes it as one JSON object per line and writes the
    result to the output file, which is flushed afterwards.

    Args:
        outputs (WriterOutputs): The writer outputs.
    """
    frame = outputs.to_pandas()
    if self.columns is not None:
        available = set(frame.columns)
        frame = frame[[name for name in self.columns if name in available]]

    payload = frame.to_json(orient="records", lines=True)
    # Normalize to exactly one trailing newline so that consecutive chunks
    # never end up sharing a line, and write nothing at all for an empty
    # chunk so no blank line is added to the file.
    records = payload.rstrip("\n")
    if records:
        self.output_file.write(records + "\n")
    self.output_file.flush()

process_end ¤

process_end()

Process the end of the writing process.

Source code in src/unitorch/cli/writers/__init__.py
97
98
99
def process_end(self):
    """Finish the writing process by closing the output file handle."""
    handle = self.output_file
    handle.close()

process_start ¤

process_start(outputs: WriterOutputs)

Process the start of the writing process.

Parameters:

Name Type Description Default
outputs WriterOutputs

The writer outputs.

required
Source code in src/unitorch/cli/writers/__init__.py
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def process_start(self, outputs: WriterOutputs):
    """
    Process the start of the writing process.

    Converts ``outputs`` to a pandas DataFrame, optionally restricts it to
    ``self.columns`` (kept in that order; names missing from the DataFrame
    are ignored), serializes it as one JSON object per line and writes the
    result to the output file, which is flushed afterwards. No separate
    header line is written here.

    Args:
        outputs (WriterOutputs): The writer outputs.
    """
    frame = outputs.to_pandas()
    if self.columns is not None:
        available = set(frame.columns)
        frame = frame[[name for name in self.columns if name in available]]

    payload = frame.to_json(orient="records", lines=True)
    # Normalize to exactly one trailing newline so that the records written
    # here never share a line with whatever is written next, and write
    # nothing at all for empty outputs so no blank line is added.
    records = payload.rstrip("\n")
    if records:
        self.output_file.write(records + "\n")
    self.output_file.flush()