From 965abd81b46cd712ed2ac4afd5a55b4cbebb3d36 Mon Sep 17 00:00:00 2001
From: Stanislas0
Date: Tue, 13 Dec 2022 05:36:17 +0000
Subject: [PATCH] Update data processing

---
 codegeex/data/process_pretrain_dataset.py | 2 +-
 codegeex/data/processor.py                | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/codegeex/data/process_pretrain_dataset.py b/codegeex/data/process_pretrain_dataset.py
index 4942266..8d9f03b 100644
--- a/codegeex/data/process_pretrain_dataset.py
+++ b/codegeex/data/process_pretrain_dataset.py
@@ -58,7 +58,7 @@ def process_sample(
 
     try:
         if language is not None and language in LANGUAGE_TAG.keys():
-            code = LANGUAGE_TAG[language] + sample["code"]
+            code = LANGUAGE_TAG[language] + "\n" + sample["code"]
         else:
             code = sample["code"]
     except Exception as e:
diff --git a/codegeex/data/processor.py b/codegeex/data/processor.py
index 4ea507f..25775da 100644
--- a/codegeex/data/processor.py
+++ b/codegeex/data/processor.py
@@ -67,6 +67,9 @@ class PromptDatasetProcessor(object):
         """
         Instead of processing lazily, we turn the iterable into a list.
         """
+        if sample is None:
+            return None
+
         return list(self.process_sample(sample))
 
     def process_sample_(self, sample) -> List[Dict[str, List[int]]]:
@@ -141,6 +144,9 @@ class LabelDatasetProcessor(object):
         """
         Instead of processing lazily, we turn the iterable into a list.
         """
+        if sample is None:
+            return None
+
         return list(self.process_sample(sample))
 
     def process_sample_(self, sample) -> List[Dict[str, List[int]]]: