mltraining/trainers.py

import numpy as np
from transformers import (
    AutoModelForTokenClassification,
AutoTokenizer,
DataCollatorForTokenClassification,
Trainer,
TrainingArguments,
EvalPrediction,
)
from typing import Union
from datasets import (
    Dataset,
    DatasetDict,
    IterableDataset,
    IterableDatasetDict,
    load_dataset,
)
import evaluate


class TokenClassificationTrainer:
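    """Fine-tune a pretrained transformer for token classification (e.g. NER).

    Wraps dataset loading, subword/label alignment, seqeval-style metrics,
    and the Hugging Face `Trainer` loop behind one class.
    """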
def __init__(
self,
model: str,
dataset: str,
labels_name: str = "labels",
evaluator: str = "seqeval",
) -> None:
self._dataset: Union[
DatasetDict, Dataset, IterableDatasetDict, IterableDataset
        ] = load_dataset(dataset)
        self._labels_name = labels_name
        self._labels: list[str] = (
            self._dataset["train"].features[labels_name].feature.names
        )  # type: ignore
self._id_to_label: dict[int, str] = {}
self._label_to_id: dict[str, int] = {}
        for label_id, label in enumerate(self._labels):
            self._id_to_label[label_id] = label
            self._label_to_id[label] = label_id
        self._model = AutoModelForTokenClassification.from_pretrained(
model,
num_labels=len(self._labels),
id2label=self._id_to_label,
label2id=self._label_to_id,
)
self._tokenizer = AutoTokenizer.from_pretrained(model)
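        # The collator dynamically pads both inputs and labels to the longest
        # sequence in each batch.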
self._data_collator = DataCollatorForTokenClassification(
tokenizer=self._tokenizer
)
        self._evaluator = evaluate.load(evaluator)

    def tokenize_and_align_labels(self, examples):
        # Adapted from
        # https://huggingface.co/docs/transformers/tasks/token_classification
tokenized_inputs = self._tokenizer(
examples["tokens"], truncation=True, is_split_into_words=True
)
labels = []
        for i, label in enumerate(examples[self._labels_name]):
word_ids = tokenized_inputs.word_ids(
batch_index=i
) # Map tokens to their respective word.
previous_word_idx = None
label_ids = []
for word_idx in word_ids: # Set the special tokens to -100.
if word_idx is None:
label_ids.append(-100)
elif (
word_idx != previous_word_idx
): # Only label the first token of a given word.
label_ids.append(label[word_idx])
else:
label_ids.append(-100)
previous_word_idx = word_idx
labels.append(label_ids)
tokenized_inputs["labels"] = labels
        return tokenized_inputs

    def tokenize_and_align_labels_over_dataset(self):
        return self._dataset.map(self.tokenize_and_align_labels, batched=True)

    def compute_metrics(
self, evaluation_prediction: EvalPrediction
) -> dict[str, float]:
predictions, expectations = evaluation_prediction
predictions = np.argmax(predictions, axis=2)
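        # Positions labeled -100 (special tokens and continuation subwords)
        # are filtered out so seqeval only scores word-level predictions.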
true_predictions = [
[self._labels[p] for (p, l) in zip(prediction, label) if l != -100]
for prediction, label in zip(predictions, expectations)
]
true_labels = [
[self._labels[l] for (p, l) in zip(prediction, label) if l != -100]
for prediction, label in zip(predictions, expectations)
]
results: dict[str, float] = self._evaluator.compute(
predictions=true_predictions, references=true_labels
) # type: ignore
return {
"precision": results["overall_precision"],
"recall": results["overall_recall"],
"f1": results["overall_f1"],
"accuracy": results["overall_accuracy"],
        }

    def train(self, output_dir: str, **arguments):
        # Tokenize and align labels before handing the dataset to the
        # Trainer; the raw dataset lacks the input_ids/labels columns the
        # model expects.
        tokenized_dataset = self.tokenize_and_align_labels_over_dataset()
        trainer = Trainer(
            model=self._model,
            args=TrainingArguments(output_dir=output_dir, **arguments),
            train_dataset=tokenized_dataset["train"],  # type: ignore
            eval_dataset=tokenized_dataset["test"],  # type: ignore
            tokenizer=self._tokenizer,
            data_collator=self._data_collator,
            compute_metrics=self.compute_metrics,
        )
        trainer.train()
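

if __name__ == "__main__":
    # Minimal usage sketch; not part of the original module. The checkpoint
    # and dataset names below are illustrative assumptions: any Hugging Face
    # checkpoint and any token-classification dataset whose examples carry
    # "tokens" plus a label column should work.
    ner_trainer = TokenClassificationTrainer(
        model="distilbert-base-uncased",  # assumed checkpoint
        dataset="conll2003",  # assumed dataset
        labels_name="ner_tags",
    )
    ner_trainer.train(
        output_dir="ner_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        num_train_epochs=2,
        evaluation_strategy="epoch",
    )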