From 14712c90cee6a7f07ef3cff9c78cde2b6eb7ace2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Sun, 10 Mar 2024 16:07:10 +0100 Subject: [PATCH 01/18] add NoisyNER dataset class --- flair/datasets/sequence_labeling.py | 53 +++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 9652070521..d04d665700 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4839,6 +4839,59 @@ def __init__( ) +class NoisyNER_EST_Clean(ColumnCorpus): + data_url = "https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/patnlp/estner.cnll.zip" + + def __init__( + self, + base_path: Optional[Union[str, Path]] = None, + column_format={0: "text", 3: "ner"}, + in_memory: bool = True, + **corpusargs, + ) -> None: + + data_folder, instances = self._load_data(base_path) + + train, dev, test = self._split_data(instances) + + self._write_instances(train, data_folder/"estner_clean_train.tsv") + self._write_instances(dev, data_folder/"estner_clean_dev.tsv") + self._write_instances(test, data_folder/"estner_clean_test.tsv") + + super().__init__( + data_folder, + column_format=column_format, + in_memory=in_memory, + **corpusargs, + ) + + @classmethod + def _load_data(cls, base_path) -> tuple[Path, list[str]]: + base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) + data_folder = base_path/"estner_clean" + unpack_file(cached_path(cls.data_url, data_folder), data_folder, "zip", False) + with open(data_folder/"estner.cnll") as in_file: + instances = in_file.readlines() + instances = [instance.strip().split("\t") for instance in instances] + return data_folder, instances + + @classmethod + def _split_data(cls, instances) -> tuple[list[str], list[str], list[str]]: + train = instances[:185708] + dev = instances[185708:208922] + test = 
instances[208922:] + return train, dev, test + + @classmethod + def _write_instances(cls, instances, filepath): + # CoNLL format + column_separator = "\t" + with open(filepath, "w") as out_file: + for instance in instances: + out_file.write(column_separator.join(instance)) + out_file.write("\n") + + class MASAKHA_POS(MultiCorpus): def __init__( self, From 4e9783d3ea6d4043bfd9ed7916d1049aa69ea268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Thu, 14 Mar 2024 18:15:25 +0100 Subject: [PATCH 02/18] fix: column format in files --- flair/datasets/sequence_labeling.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index d04d665700..f4d29b6455 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4845,15 +4845,15 @@ class NoisyNER_EST_Clean(ColumnCorpus): def __init__( self, base_path: Optional[Union[str, Path]] = None, - column_format={0: "text", 3: "ner"}, + column_format={0: "text", 1: "ner"}, in_memory: bool = True, **corpusargs, ) -> None: - + data_folder, instances = self._load_data(base_path) - + train, dev, test = self._split_data(instances) - + self._write_instances(train, data_folder/"estner_clean_train.tsv") self._write_instances(dev, data_folder/"estner_clean_dev.tsv") self._write_instances(test, data_folder/"estner_clean_test.tsv") @@ -4873,6 +4873,7 @@ def _load_data(cls, base_path) -> tuple[Path, list[str]]: with open(data_folder/"estner.cnll") as in_file: instances = in_file.readlines() instances = [instance.strip().split("\t") for instance in instances] + instances = [[instance[0], instance[len(instance)-1]] for instance in instances] return data_folder, instances @classmethod @@ -4884,12 +4885,12 @@ def _split_data(cls, instances) -> tuple[list[str], list[str], list[str]]: @classmethod def _write_instances(cls, 
instances, filepath): - # CoNLL format - column_separator = "\t" - with open(filepath, "w") as out_file: - for instance in instances: - out_file.write(column_separator.join(instance)) - out_file.write("\n") + # CoNLL format + column_separator = "\t" + with open(filepath, "w") as out_file: + for instance in instances: + out_file.write(column_separator.join(instance)) + out_file.write("\n") class MASAKHA_POS(MultiCorpus): From ce83274956b9adecdee08e5a933abdae99d79d37 Mon Sep 17 00:00:00 2001 From: elenamer Date: Mon, 18 Mar 2024 15:02:43 +0100 Subject: [PATCH 03/18] fix column delimiter --- flair/datasets/sequence_labeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index f4d29b6455..8ccb03393e 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4862,6 +4862,7 @@ def __init__( data_folder, column_format=column_format, in_memory=in_memory, + column_delimiter = '\t', **corpusargs, ) From 5e66069d0a61b8cb9882ab9dfc6aa7165f60ea62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Tue, 26 Mar 2024 18:09:10 +0100 Subject: [PATCH 04/18] fix: skip empty labels --- flair/datasets/sequence_labeling.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 8ccb03393e..50a0c2f2e6 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4872,9 +4872,13 @@ def _load_data(cls, base_path) -> tuple[Path, list[str]]: data_folder = base_path/"estner_clean" unpack_file(cached_path(cls.data_url, data_folder), data_folder, "zip", False) with open(data_folder/"estner.cnll") as in_file: - instances = in_file.readlines() - instances = [instance.strip().split("\t") for instance in instances] - instances = [[instance[0], instance[len(instance)-1]] for 
instance in instances] + preinstances = in_file.readlines() + preinstances = [instance.strip().split("\t") for instance in preinstances] + preinstances = [[instance[0], instance[len(instance)-1]] for instance in preinstances] + instances = [] + for instance in preinstances: + if instance[0] != '--': + instances.append(instance) return data_folder, instances @classmethod From bd622878fceda57dc4cfe48426a1e59596c59d57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Fri, 19 Apr 2024 10:46:10 +0200 Subject: [PATCH 05/18] include noisy labelsets --- flair/datasets/sequence_labeling.py | 111 +++++++++++++++++++++------- 1 file changed, 83 insertions(+), 28 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 50a0c2f2e6..f3f049b7f4 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4839,27 +4839,37 @@ def __init__( ) -class NoisyNER_EST_Clean(ColumnCorpus): +class NOISY_NER_EST(ColumnCorpus): data_url = "https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/patnlp/estner.cnll.zip" def __init__( self, - base_path: Optional[Union[str, Path]] = None, + version: int = 0, + base_path: Optional[Union[str, Path]] = None, column_format={0: "text", 1: "ner"}, in_memory: bool = True, **corpusargs, ) -> None: - data_folder, instances = self._load_data(base_path) - + assert version in range(0,8) + base_path = self._set_path(base_path) + features = self._load_features(base_path) + if version == 0: + preinstances = self._process_clean_labels(features) + else: + rdcd_features = self._rmv_clean_labels(features) + preinstances = self._process_noisy_labels(base_path, version, rdcd_features) + instances = self._delete_empty_labels(version, preinstances) train, dev, test = self._split_data(instances) - - self._write_instances(train, data_folder/"estner_clean_train.tsv") - 
self._write_instances(dev, data_folder/"estner_clean_dev.tsv") - self._write_instances(test, data_folder/"estner_clean_test.tsv") - - super().__init__( - data_folder, + self._write_instances(version, base_path, "train", train) + self._write_instances(version, base_path, "dev", dev) + self._write_instances(version, base_path, "test", test) + + super().__init__( + data_folder=base_path, + train_file=f"estner_noisy_labelset{version}_train.tsv", + dev_file=f"estner_noisy_labelset{version}_dev.tsv", + test_file=f"estner_noisy_labelset{version}_test.tsv", column_format=column_format, in_memory=in_memory, column_delimiter = '\t', @@ -4867,35 +4877,80 @@ def __init__( ) @classmethod - def _load_data(cls, base_path) -> tuple[Path, list[str]]: - base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) - data_folder = base_path/"estner_clean" - unpack_file(cached_path(cls.data_url, data_folder), data_folder, "zip", False) - with open(data_folder/"estner.cnll") as in_file: - preinstances = in_file.readlines() - preinstances = [instance.strip().split("\t") for instance in preinstances] - preinstances = [[instance[0], instance[len(instance)-1]] for instance in preinstances] + def _set_path(cls, base_path) -> Path: + if not base_path: + base_path = flair.cache_root/"datasets"/"estner_clean" + else: + base_path = Path(base_path) + return base_path + + @classmethod + def _load_features(cls, base_path) -> list[str]: + unpack_file(cached_path(cls.data_url, base_path), base_path, "zip", False) + with open(f"{base_path}/estner.cnll") as in_file: + prefeatures = in_file.readlines() + features = [feature.strip().split("\t") for feature in prefeatures] + return features + + @classmethod + def _process_clean_labels(cls, features) -> list[str]: + preinstances = [[instance[0], instance[len(instance)-1]] for instance in features] + return preinstances + + @classmethod + def _rmv_clean_labels(cls, features) -> list[str]: + rdcd_features = [feature[:-1] for feature in 
features] + return rdcd_features + + @classmethod + def _process_noisy_labels(cls, base_path, version, rdcd_features) -> list[str]: + label_file_path = f"{base_path}/NoisyNER_labelset{version}.labels" + try: + with open(label_file_path) as in_file: + labels = in_file.read().splitlines() + except FileNotFoundError: + raise Exception("") + instances = [] - for instance in preinstances: - if instance[0] != '--': + label_idx = 0 + for feature in rdcd_features: + if len(feature) == 0: + instances.append("") + else: + assert label_idx < len(labels) + instance = [feature[0], labels[label_idx]] instances.append(instance) - return data_folder, instances + label_idx += 1 + assert label_idx == len(labels) + return instances + @classmethod + def _delete_empty_labels(cls, version, preinstances) -> list[str]: + instances = [] + if version == 0: + for instance in preinstances: + if instance[0] != '--': + instances.append(instance) + else: + for instance in preinstances: + if instance != '--': + instances.append(instance) + return instances + @classmethod def _split_data(cls, instances) -> tuple[list[str], list[str], list[str]]: train = instances[:185708] dev = instances[185708:208922] test = instances[208922:] return train, dev, test - + @classmethod - def _write_instances(cls, instances, filepath): - # CoNLL format + def _write_instances(cls, version, base_path, split, data): column_separator = "\t" - with open(filepath, "w") as out_file: - for instance in instances: + with open(f"{base_path}/estner_noisy_labelset{version}_{split}.tsv", "w") as out_file: + for instance in data: out_file.write(column_separator.join(instance)) - out_file.write("\n") + out_file.write("\n") class MASAKHA_POS(MultiCorpus): From ca69b9b2d1c8fab6933b9d0b5aed9b1beca67ccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:16:53 +0200 Subject: [PATCH 06/18] added Exceptions --- 
flair/datasets/sequence_labeling.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index f3f049b7f4..0a94f2c744 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4851,16 +4851,22 @@ def __init__( **corpusargs, ) -> None: - assert version in range(0,8) + if not (version in range(0,8)): + raise Exception("Please choose a version (int) from 0 to 7. With v0 (default) you get the clean labelset for the data, while v1 to v7 provide different kinds of noisy labelsets. For details see https://ojs.aaai.org/index.php/AAAI/article/view/16938.") + base_path = self._set_path(base_path) features = self._load_features(base_path) + if version == 0: preinstances = self._process_clean_labels(features) else: rdcd_features = self._rmv_clean_labels(features) preinstances = self._process_noisy_labels(base_path, version, rdcd_features) + instances = self._delete_empty_labels(version, preinstances) + train, dev, test = self._split_data(instances) + self._write_instances(version, base_path, "train", train) self._write_instances(version, base_path, "dev", dev) self._write_instances(version, base_path, "test", test) @@ -4909,7 +4915,7 @@ def _process_noisy_labels(cls, base_path, version, rdcd_features) -> list[str]: with open(label_file_path) as in_file: labels = in_file.read().splitlines() except FileNotFoundError: - raise Exception("") + raise Exception("Please download the noisy labelset file of your choice from https://github.com/uds-lsv/NoisyNER/tree/master/data/only_labels. Set the base_path argument to the path of the directory you saved the file in. 
Make sure, the version argument matches the file.") instances = [] label_idx = 0 @@ -4921,7 +4927,7 @@ def _process_noisy_labels(cls, base_path, version, rdcd_features) -> list[str]: instance = [feature[0], labels[label_idx]] instances.append(instance) label_idx += 1 - assert label_idx == len(labels) + assert label_idx == len(labels), "" return instances @classmethod @@ -4946,11 +4952,11 @@ def _split_data(cls, instances) -> tuple[list[str], list[str], list[str]]: @classmethod def _write_instances(cls, version, base_path, split, data): - column_separator = "\t" + column_separator = "\t" # CoNLL format with open(f"{base_path}/estner_noisy_labelset{version}_{split}.tsv", "w") as out_file: for instance in data: out_file.write(column_separator.join(instance)) - out_file.write("\n") + out_file.write("\n") class MASAKHA_POS(MultiCorpus): From 7da96abf8b06d2e33e5fc6636b231a208200ba71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:25:18 +0200 Subject: [PATCH 07/18] added NOISY_NER_EST dataset class --- flair/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 56c7e4dd0a..364bf5a215 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -218,6 +218,7 @@ NER_SWEDISH, NER_TURKU, NER_UKRAINIAN, + NOISY_NER_EST, ONTONOTES, UP_CHINESE, UP_ENGLISH, @@ -499,6 +500,7 @@ "NER_SWEDISH", "NER_TURKU", "NER_UKRAINIAN", + "NOISY_NER_EST", "UP_CHINESE", "UP_ENGLISH", "UP_FINNISH", From 728b68fe179b5456911edd8b40a3e000853e43fb Mon Sep 17 00:00:00 2001 From: elenamer Date: Tue, 28 May 2024 10:36:20 +0200 Subject: [PATCH 08/18] change type annotations (otherwise it doesn't work with python 3.8) --- flair/datasets/sequence_labeling.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py 
index 0a94f2c744..4d22eeef09 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4891,7 +4891,8 @@ def _set_path(cls, base_path) -> Path: return base_path @classmethod - def _load_features(cls, base_path) -> list[str]: + def _load_features(cls, base_path) -> List[str]: + print(base_path) unpack_file(cached_path(cls.data_url, base_path), base_path, "zip", False) with open(f"{base_path}/estner.cnll") as in_file: prefeatures = in_file.readlines() @@ -4899,17 +4900,17 @@ def _load_features(cls, base_path) -> list[str]: return features @classmethod - def _process_clean_labels(cls, features) -> list[str]: + def _process_clean_labels(cls, features) -> List[str]: preinstances = [[instance[0], instance[len(instance)-1]] for instance in features] return preinstances @classmethod - def _rmv_clean_labels(cls, features) -> list[str]: + def _rmv_clean_labels(cls, features) -> List[str]: rdcd_features = [feature[:-1] for feature in features] return rdcd_features @classmethod - def _process_noisy_labels(cls, base_path, version, rdcd_features) -> list[str]: + def _process_noisy_labels(cls, base_path, version, rdcd_features) -> List[str]: label_file_path = f"{base_path}/NoisyNER_labelset{version}.labels" try: with open(label_file_path) as in_file: @@ -4931,7 +4932,7 @@ def _process_noisy_labels(cls, base_path, version, rdcd_features) -> list[str]: return instances @classmethod - def _delete_empty_labels(cls, version, preinstances) -> list[str]: + def _delete_empty_labels(cls, version, preinstances) -> List[str]: instances = [] if version == 0: for instance in preinstances: @@ -4944,7 +4945,7 @@ def _delete_empty_labels(cls, version, preinstances) -> list[str]: return instances @classmethod - def _split_data(cls, instances) -> tuple[list[str], list[str], list[str]]: + def _split_data(cls, instances) -> Tuple[List[str], List[str], List[str]]: train = instances[:185708] dev = instances[185708:208922] test = instances[208922:] From 
b0fe0b45910c6579035882fc32f378966f7510b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Thu, 30 May 2024 18:30:02 +0200 Subject: [PATCH 09/18] add parameter descriptions and auto-download of labelsets --- flair/datasets/sequence_labeling.py | 84 +++++++++++++++++------------ 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 4d22eeef09..963995f7bd 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4841,51 +4841,63 @@ def __init__( class NOISY_NER_EST(ColumnCorpus): data_url = "https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/patnlp/estner.cnll.zip" + label_url = "https://raw.githubusercontent.com/uds-lsv/NoisyNER/master/data/only_labels" def __init__( self, version: int = 0, base_path: Optional[Union[str, Path]] = None, - column_format={0: "text", 1: "ner"}, - in_memory: bool = True, + in_memory: bool = True, **corpusargs, ) -> None: + """ + :param version: Chooses the labelset for the data. + v0 (default): clean labels + v1 to v7: different kinds of noisy labelsets (details: https://ojs.aaai.org/index.php/AAAI/article/view/16938) + :param base_path: Default is None, meaning the corpus gets automatically downloaded and saved. + You can override this by passing a path to a directory containing the unprocessed files but typically this + should not be necessary. + :param in_memory: If True the dataset is kept in memory achieving speedups in training. + """ + + if not (version in range(0, 8)): + raise Exception( + "Please choose a version (int) from 0 to 7. With v0 (default) you get the clean labelset for the data, while v1 to v7 provide different kinds of noisy labelsets. For details see https://ojs.aaai.org/index.php/AAAI/article/view/16938." 
+ ) - if not (version in range(0,8)): - raise Exception("Please choose a version (int) from 0 to 7. With v0 (default) you get the clean labelset for the data, while v1 to v7 provide different kinds of noisy labelsets. For details see https://ojs.aaai.org/index.php/AAAI/article/view/16938.") - base_path = self._set_path(base_path) features = self._load_features(base_path) - + if version == 0: preinstances = self._process_clean_labels(features) else: rdcd_features = self._rmv_clean_labels(features) - preinstances = self._process_noisy_labels(base_path, version, rdcd_features) - + labels = self._load_noisy_labels(version, base_path) + preinstances = self._process_noisy_labels(rdcd_features, labels) + instances = self._delete_empty_labels(version, preinstances) - + train, dev, test = self._split_data(instances) - + self._write_instances(version, base_path, "train", train) self._write_instances(version, base_path, "dev", dev) self._write_instances(version, base_path, "test", test) - super().__init__( - data_folder=base_path, - train_file=f"estner_noisy_labelset{version}_train.tsv", + super().__init__( + data_folder=base_path, + train_file=f"estner_noisy_labelset{version}_train.tsv", dev_file=f"estner_noisy_labelset{version}_dev.tsv", - test_file=f"estner_noisy_labelset{version}_test.tsv", - column_format=column_format, + test_file=f"estner_noisy_labelset{version}_test.tsv", + column_format={0: "text", 1: "ner"}, in_memory=in_memory, - column_delimiter = '\t', + column_delimiter="\t", **corpusargs, ) @classmethod def _set_path(cls, base_path) -> Path: if not base_path: - base_path = flair.cache_root/"datasets"/"estner_clean" + base_path = flair.cache_root / "datasets" / "estner" else: base_path = Path(base_path) return base_path @@ -4901,7 +4913,7 @@ def _load_features(cls, base_path) -> List[str]: @classmethod def _process_clean_labels(cls, features) -> List[str]: - preinstances = [[instance[0], instance[len(instance)-1]] for instance in features] + preinstances = 
[[instance[0], instance[len(instance) - 1]] for instance in features] return preinstances @classmethod @@ -4910,14 +4922,15 @@ def _rmv_clean_labels(cls, features) -> List[str]: return rdcd_features @classmethod - def _process_noisy_labels(cls, base_path, version, rdcd_features) -> List[str]: - label_file_path = f"{base_path}/NoisyNER_labelset{version}.labels" - try: - with open(label_file_path) as in_file: - labels = in_file.read().splitlines() - except FileNotFoundError: - raise Exception("Please download the noisy labelset file of your choice from https://github.com/uds-lsv/NoisyNER/tree/master/data/only_labels. Set the base_path argument to the path of the directory you saved the file in. Make sure, the version argument matches the file.") - + def _load_noisy_labels(cls, version, base_path) -> List[str]: + file_name = f"NoisyNER_labelset{version}.labels" + cached_path(f"{cls.label_url}/{file_name}", base_path) + with open(f"{base_path}/{file_name}") as in_file: + labels = in_file.read().splitlines() + return labels + + @classmethod + def _process_noisy_labels(cls, rdcd_features, labels) -> List[str]: instances = [] label_idx = 0 for feature in rdcd_features: @@ -4928,7 +4941,7 @@ def _process_noisy_labels(cls, base_path, version, rdcd_features) -> List[str]: instance = [feature[0], labels[label_idx]] instances.append(instance) label_idx += 1 - assert label_idx == len(labels), "" + assert label_idx == len(labels), "" return instances @classmethod @@ -4936,28 +4949,29 @@ def _delete_empty_labels(cls, version, preinstances) -> List[str]: instances = [] if version == 0: for instance in preinstances: - if instance[0] != '--': + if instance[0] != "--": instances.append(instance) else: for instance in preinstances: - if instance != '--': + if instance != "--": instances.append(instance) - return instances - + return instances + @classmethod - def _split_data(cls, instances) -> Tuple[List[str], List[str], List[str]]: + def _split_data(cls, instances) -> 
Tuple[List[str], List[str], List[str]]: train = instances[:185708] dev = instances[185708:208922] test = instances[208922:] return train, dev, test - + @classmethod def _write_instances(cls, version, base_path, split, data): - column_separator = "\t" # CoNLL format + column_separator = "\t" # CoNLL format with open(f"{base_path}/estner_noisy_labelset{version}_{split}.tsv", "w") as out_file: for instance in data: out_file.write(column_separator.join(instance)) - out_file.write("\n") + out_file.write("\n") + class MASAKHA_POS(MultiCorpus): From d56166012a9433f63cddd42775d1e9bc7413f639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Thu, 30 May 2024 19:14:47 +0200 Subject: [PATCH 10/18] code formatting --- flair/datasets/sequence_labeling.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 963995f7bd..2b2dda115a 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4850,7 +4850,8 @@ def __init__( in_memory: bool = True, **corpusargs, ) -> None: - """ + """Initialize the NoisyNER corpus. + :param version: Chooses the labelset for the data. v0 (default): clean labels v1 to v7: different kinds of noisy labelsets (details: https://ojs.aaai.org/index.php/AAAI/article/view/16938) @@ -4859,8 +4860,7 @@ def __init__( should not be necessary. :param in_memory: If True the dataset is kept in memory achieving speedups in training. """ - - if not (version in range(0, 8)): + if version not in range(8): raise Exception( "Please choose a version (int) from 0 to 7. With v0 (default) you get the clean labelset for the data, while v1 to v7 provide different kinds of noisy labelsets. For details see https://ojs.aaai.org/index.php/AAAI/article/view/16938." 
) @@ -4896,10 +4896,7 @@ def __init__( @classmethod def _set_path(cls, base_path) -> Path: - if not base_path: - base_path = flair.cache_root / "datasets" / "estner" - else: - base_path = Path(base_path) + base_path = flair.cache_root / "datasets" / "estner" if not base_path else base_path = Path(base_path) return base_path @classmethod @@ -4971,8 +4968,7 @@ def _write_instances(cls, version, base_path, split, data): for instance in data: out_file.write(column_separator.join(instance)) out_file.write("\n") - - + class MASAKHA_POS(MultiCorpus): def __init__( From d7024d62e1467e7d47d9db2cc52111c8f0a50d23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Thu, 30 May 2024 22:15:23 +0200 Subject: [PATCH 11/18] syntax fix --- flair/datasets/sequence_labeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 2b2dda115a..f5613ea863 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4896,7 +4896,7 @@ def __init__( @classmethod def _set_path(cls, base_path) -> Path: - base_path = flair.cache_root / "datasets" / "estner" if not base_path else base_path = Path(base_path) + base_path = flair.cache_root/"datasets"/"estner" if not base_path else Path(base_path) return base_path @classmethod From bd0f733734b98c0a0e9273035af6fb109eb5c03e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Fri, 31 May 2024 10:26:22 +0200 Subject: [PATCH 12/18] code formatting --- flair/datasets/sequence_labeling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index f5613ea863..ce5c9e0331 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4851,7 +4851,7 @@ def 
__init__( **corpusargs, ) -> None: """Initialize the NoisyNER corpus. - + :param version: Chooses the labelset for the data. v0 (default): clean labels v1 to v7: different kinds of noisy labelsets (details: https://ojs.aaai.org/index.php/AAAI/article/view/16938) @@ -4896,7 +4896,7 @@ def __init__( @classmethod def _set_path(cls, base_path) -> Path: - base_path = flair.cache_root/"datasets"/"estner" if not base_path else Path(base_path) + base_path = flair.cache_root / "datasets" / "estner" if not base_path else Path(base_path) return base_path @classmethod @@ -4968,7 +4968,7 @@ def _write_instances(cls, version, base_path, split, data): for instance in data: out_file.write(column_separator.join(instance)) out_file.write("\n") - + class MASAKHA_POS(MultiCorpus): def __init__( From dac9c5c67c9c28d24e3a930419a1a3ff73d89441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Mon, 3 Jun 2024 11:34:14 +0200 Subject: [PATCH 13/18] updated type annotations --- flair/datasets/sequence_labeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index ce5c9e0331..7eb72a2eaf 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4900,7 +4900,7 @@ def _set_path(cls, base_path) -> Path: return base_path @classmethod - def _load_features(cls, base_path) -> List[str]: + def _load_features(cls, base_path) -> List[List[str]]: print(base_path) unpack_file(cached_path(cls.data_url, base_path), base_path, "zip", False) with open(f"{base_path}/estner.cnll") as in_file: @@ -4909,7 +4909,7 @@ def _load_features(cls, base_path) -> List[str]: return features @classmethod - def _process_clean_labels(cls, features) -> List[str]: + def _process_clean_labels(cls, features) -> List[List[str]]: preinstances = [[instance[0], instance[len(instance) - 1]] for instance in features] return 
preinstances From a2b3da6d27476a53418ce094d18f1e4f25de50eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Mon, 3 Jun 2024 11:58:59 +0200 Subject: [PATCH 14/18] type formatting --- flair/datasets/sequence_labeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 7eb72a2eaf..26a21a0087 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4927,12 +4927,12 @@ def _load_noisy_labels(cls, version, base_path) -> List[str]: return labels @classmethod - def _process_noisy_labels(cls, rdcd_features, labels) -> List[str]: + def _process_noisy_labels(cls, rdcd_features, labels) -> List[List[str]]: instances = [] label_idx = 0 for feature in rdcd_features: if len(feature) == 0: - instances.append("") + instances.append([""]) else: assert label_idx < len(labels) instance = [feature[0], labels[label_idx]] From 1291dff103f0c3b4aa894430713cc7117295c05f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Thu, 4 Jul 2024 12:19:25 +0200 Subject: [PATCH 15/18] specify file encoding --- flair/datasets/sequence_labeling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 26a21a0087..136c33f2f2 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4903,7 +4903,7 @@ def _set_path(cls, base_path) -> Path: def _load_features(cls, base_path) -> List[List[str]]: print(base_path) unpack_file(cached_path(cls.data_url, base_path), base_path, "zip", False) - with open(f"{base_path}/estner.cnll") as in_file: + with open(f"{base_path}/estner.cnll", encoding="utf-8") as in_file: prefeatures = in_file.readlines() features = [feature.strip().split("\t") for 
feature in prefeatures] return features @@ -4922,7 +4922,7 @@ def _rmv_clean_labels(cls, features) -> List[str]: def _load_noisy_labels(cls, version, base_path) -> List[str]: file_name = f"NoisyNER_labelset{version}.labels" cached_path(f"{cls.label_url}/{file_name}", base_path) - with open(f"{base_path}/{file_name}") as in_file: + with open(f"{base_path}/{file_name}", encoding="utf-8") as in_file: labels = in_file.read().splitlines() return labels @@ -4964,7 +4964,7 @@ def _split_data(cls, instances) -> Tuple[List[str], List[str], List[str]]: @classmethod def _write_instances(cls, version, base_path, split, data): column_separator = "\t" # CoNLL format - with open(f"{base_path}/estner_noisy_labelset{version}_{split}.tsv", "w") as out_file: + with open(f"{base_path}/estner_noisy_labelset{version}_{split}.tsv", "w", encoding="utf-8") as out_file: for instance in data: out_file.write(column_separator.join(instance)) out_file.write("\n") From 77118bf71f4f40f4aa2c673e0d3b41a1161d61b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teresa=20L=C3=B6ffelhardt?= <132658673+teresaloeffelhardt@users.noreply.github.com> Date: Mon, 8 Jul 2024 11:11:59 +0200 Subject: [PATCH 16/18] modify param descriptions to match google docstring format --- flair/datasets/sequence_labeling.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 136c33f2f2..672f3aae42 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4852,13 +4852,16 @@ def __init__( ) -> None: """Initialize the NoisyNER corpus. - :param version: Chooses the labelset for the data. - v0 (default): clean labels - v1 to v7: different kinds of noisy labelsets (details: https://ojs.aaai.org/index.php/AAAI/article/view/16938) - :param base_path: Default is None, meaning the corpus gets automatically downloaded and saved. 
- You can override this by passing a path to a directory containing the unprocessed files but typically this - should not be necessary. - :param in_memory: If True the dataset is kept in memory achieving speedups in training. + Args: + version (int): Chooses the labelset for the data. + v0 (default): Clean labels + v1 to v7: Different kinds of noisy labelsets (details: https://ojs.aaai.org/index.php/AAAI/article/view/16938) + base_path (Optional[Union[str, Path]]): Path to the data. + Default is None, meaning the corpus gets automatically downloaded and saved. + You can override this by passing a path to a directory containing the unprocessed files but typically this + should not be necessary. + in_memory (bool): If True the dataset is kept in memory achieving speedups in training. + **corpusargs: The arguments propagated to :meth:`flair.datasets.ColumnCorpus.__init__`. """ if version not in range(8): raise Exception( From d77d61da49fcaeb30e817776d646ecb0acdbb0f1 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Mon, 15 Jul 2024 16:07:19 +0200 Subject: [PATCH 17/18] Rename to make consistent with other dataset names --- flair/datasets/__init__.py | 4 ++-- flair/datasets/sequence_labeling.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index d1b916483d..5eb7cfcab2 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -218,7 +218,7 @@ NER_SWEDISH, NER_TURKU, NER_UKRAINIAN, - NOISY_NER_EST, + NER_ESTONIAN_NOISY, ONTONOTES, UP_CHINESE, UP_ENGLISH, @@ -506,7 +506,7 @@ "NER_SWEDISH", "NER_TURKU", "NER_UKRAINIAN", - "NOISY_NER_EST", + "NER_ESTONIAN_NOISY", "UP_CHINESE", "UP_ENGLISH", "UP_FINNISH", diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 672f3aae42..38ca75e94b 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4839,7 +4839,7 @@ def __init__( ) -class NOISY_NER_EST(ColumnCorpus):
+class NER_ESTONIAN_NOISY(ColumnCorpus): data_url = "https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/patnlp/estner.cnll.zip" label_url = "https://raw.githubusercontent.com/uds-lsv/NoisyNER/master/data/only_labels" From 4556084420ae238e79ea1df6ef7a6698be72ae9c Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Mon, 15 Jul 2024 21:20:33 +0200 Subject: [PATCH 18/18] Ruff fixes --- flair/datasets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 5eb7cfcab2..2837e017c0 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -196,6 +196,7 @@ NER_ENGLISH_WEBPAGES, NER_ENGLISH_WIKIGOLD, NER_ENGLISH_WNUT_2020, + NER_ESTONIAN_NOISY, NER_FINNISH, NER_GERMAN_BIOFID, NER_GERMAN_EUROPARL, @@ -218,7 +219,6 @@ NER_SWEDISH, NER_TURKU, NER_UKRAINIAN, - NER_ESTONIAN_NOISY, ONTONOTES, UP_CHINESE, UP_ENGLISH,