import numpy as np
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    EvalPrediction,
)
from typing import Union
from datasets import load_dataset
from datasets.dataset_dict import (
    DatasetDict,
    Dataset,
    IterableDatasetDict,
)
from datasets.iterable_dataset import IterableDataset
import evaluate


class TokenClassificationTrainer:
    def __init__(
        self,
        model: str,
        dataset: str,
        labels_name: str = "labels",
        evaluator: str = "seqeval",
    ) -> None:
        self._dataset: Union[
            DatasetDict, Dataset, IterableDatasetDict, IterableDataset
        ] = load_dataset(dataset)
        self._labels: list[str] = (
            self._dataset["train"].features[labels_name].feature.names
        )  # type: ignore
        # Label <-> id mappings, passed into the model config below.
        self._id_to_label: dict[int, str] = dict(enumerate(self._labels))
        self._label_to_id: dict[str, int] = {
            label: idx for idx, label in enumerate(self._labels)
        }
        # Token classification head (one prediction per token), not
        # sequence classification.
        self._model = AutoModelForTokenClassification.from_pretrained(
            model,
            num_labels=len(self._labels),
            id2label=self._id_to_label,
            label2id=self._label_to_id,
        )
        self._tokenizer = AutoTokenizer.from_pretrained(model)
        self._data_collator = DataCollatorForTokenClassification(
            tokenizer=self._tokenizer
        )
        self._evaluator = evaluate.load(evaluator)

    def tokenize_and_align_labels(self, examples):
        # Straight from
        # https://huggingface.co/docs/transformers/tasks/token_classification
        tokenized_inputs = self._tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )

        labels = []
        for i, label in enumerate(examples["ner_tags"]):
            # Map tokens to their respective word.
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    # Special tokens get -100 so the loss ignores them.
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    # Only label the first token of a given word.
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx
            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    def tokenize_and_align_labels_over_dataset(self):
        return self._dataset.map(self.tokenize_and_align_labels, batched=True)

    def compute_metrics(
        self, evaluation_prediction: EvalPrediction
    ) -> dict[str, float]:
        predictions, expectations = evaluation_prediction
        predictions = np.argmax(predictions, axis=2)

        # Drop the -100 positions (special tokens and non-first sub-words)
        # before handing the sequences to the evaluator.
        true_predictions = [
            [self._labels[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, expectations)
        ]
        true_labels = [
            [self._labels[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, expectations)
        ]

        results: dict[str, float] = self._evaluator.compute(
            predictions=true_predictions, references=true_labels
        )  # type: ignore
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    def train(self, output_dir: str, **arguments):
        # Tokenize and align labels first: the raw dataset still holds
        # word-level tags that the model and data collator cannot consume.
        tokenized_dataset = self.tokenize_and_align_labels_over_dataset()
        trainer = Trainer(
            model=self._model,
            args=TrainingArguments(output_dir=output_dir, **arguments),
            train_dataset=tokenized_dataset["train"],  # type: ignore
            eval_dataset=tokenized_dataset["test"],  # type: ignore
            tokenizer=self._tokenizer,
            data_collator=self._data_collator,
            compute_metrics=self.compute_metrics,
        )
        trainer.train()
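

# Usage sketch, not part of the class itself. It assumes the
# "distilbert-base-uncased" checkpoint and the "wnut_17" dataset (the pairing
# used in the Hugging Face token classification tutorial referenced above);
# swap in your own checkpoint/dataset as needed. The dataset is assumed to
# expose "tokens" and "ner_tags" columns and a "test" split, since
# tokenize_and_align_labels() and train() rely on them.
if __name__ == "__main__":
    token_trainer = TokenClassificationTrainer(
        model="distilbert-base-uncased",  # assumed base checkpoint
        dataset="wnut_17",  # assumed dataset with "tokens"/"ner_tags" columns
        labels_name="ner_tags",  # label feature name in the assumed dataset
    )
    # Extra keyword arguments are forwarded to TrainingArguments.
    token_trainer.train(
        output_dir="token-classification-model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
    )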