## Experiment exp020-4
GBERT-Large, Batch Size 32, Warmup Steps 200, Learning Rate 2e-5

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    BertForTokenClassification,

)
import torch
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

2025-06-25 23:36:01.712205: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-25 23:36:01.739052: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750887361.774014 1226991 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750887361.784804 1226991 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750887361.814945 1226991 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
class SpanClassifierWithStrictF1:
    def __init__(self, model_name="deepset/gbert-base"):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.labels =[
            "O",
            "B-positive feedback", "B-compliment", "B-affection declaration", "B-encouragement", "B-gratitude", "B-agreement", "B-ambiguous", "B-implicit", "B-group membership", "B-sympathy",
            "I-positive feedback", "I-compliment", "I-affection declaration", "I-encouragement", "I-gratitude", "I-agreement", "I-ambiguous", "I-implicit", "I-group membership", "I-sympathy"
        ]
        self.label2id = {label: i for i, label in enumerate(self.labels)}
        self.id2label = {i: label for i, label in enumerate(self.labels)}

    def create_dataset(self, comments_df, spans_df):
        """Erstelle Dataset mit BIO-Labels und speichere Evaluation-Daten"""
        examples = []
        eval_data = []  # Für Strict F1 Berechnung

        spans_grouped = spans_df.groupby(['document', 'comment_id'])

        for _, row in comments_df.iterrows():
            text = row['comment']
            document = row['document']
            comment_id = row['comment_id']
            key = (document, comment_id)

            # True spans für diesen Kommentar
            if key in spans_grouped.groups:
                true_spans = [(span_type, int(start), int(end))
                              for span_type, start, end in
                              spans_grouped.get_group(key)[['type', 'start', 'end']].values]
            else:
                true_spans = []

            # Tokenisierung
            tokenized = self.tokenizer(text, truncation=True, max_length=512,
                                       return_offsets_mapping=True)

            # BIO-Labels erstellen
            labels = self._create_bio_labels(tokenized['offset_mapping'],
                                             spans_grouped.get_group(key)[['start', 'end', 'type']].values
                                             if key in spans_grouped.groups else [])

            examples.append({
                'input_ids': tokenized['input_ids'],
                'attention_mask': tokenized['attention_mask'],
                'labels': labels
            })

            # Evaluation-Daten speichern
            eval_data.append({
                'text': text,
                'offset_mapping': tokenized['offset_mapping'],
                'true_spans': true_spans,
                'document': document,
                'comment_id': comment_id
            })

        return examples, eval_data

    def _create_bio_labels(self, offset_mapping, spans):
        """Erstelle BIO-Labels für Tokens"""
        labels = [0] * len(offset_mapping)  # 0 = "O"

        for start, end, type_label in spans:
            for i, (token_start, token_end) in enumerate(offset_mapping):
                if token_start is None:  # Spezielle Tokens
                    continue

                # Token überlappt mit Span
                if token_start < end and token_end > start:
                    if token_start <= start:
                        labels[i] = self.label2id[f'B-{type_label}'] # B-compliment
                    else:
                        labels[i] = self.label2id[f'I-{type_label}'] # I-compliment

        return labels

    def compute_metrics(self, eval_pred):
        """Berechne Strict F1 für Trainer"""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=2)

        # Konvertiere Vorhersagen zu Spans
        batch_pred_spans = []
        batch_true_spans = []

        for i, (pred_seq, label_seq) in enumerate(zip(predictions, labels)):
            # Evaluation-Daten für dieses Beispiel
            if i < len(self.current_eval_data):
                eval_item = self.current_eval_data[i]
                text = eval_item['text']
                offset_mapping = eval_item['offset_mapping']
                true_spans = eval_item['true_spans']

                # Filtere gültige Vorhersagen (keine Padding-Tokens)
                valid_predictions = []
                valid_offsets = []

                for j, (pred_label, true_label) in enumerate(zip(pred_seq, label_seq)):
                    if true_label != -100 and j < len(offset_mapping):
                        valid_predictions.append(pred_label)
                        valid_offsets.append(offset_mapping[j])

                # Konvertiere zu Spans
                pred_spans = self._predictions_to_spans(valid_predictions, valid_offsets, text)
                pred_spans_tuples = [(span['type'], span['start'], span['end']) for span in pred_spans]

                batch_pred_spans.append(pred_spans_tuples)
                batch_true_spans.append(true_spans)

        # Berechne Strict F1
        strict_f1, strict_precision, strict_recall, tp, fp, fn = self._calculate_strict_f1(
            batch_true_spans, batch_pred_spans
        )

        torch.cuda.memory.empty_cache()

        return {
            "strict_f1": torch.tensor(strict_f1),
            "strict_precision": torch.tensor(strict_precision),
            "strict_recall": torch.tensor(strict_recall),
            "true_positives": torch.tensor(tp),
            "false_positives": torch.tensor(fp),
            "false_negatives": torch.tensor(fn)
        }

    def _calculate_strict_f1(self, true_spans_list, pred_spans_list):
        """Berechne Strict F1 über alle Kommentare"""
        tp, fp, fn = 0, 0, 0

        for true_spans, pred_spans in zip(true_spans_list, pred_spans_list):
            # Finde exakte Matches (Typ und Span müssen übereinstimmen)
            matches = self._find_exact_matches(true_spans, pred_spans)

            tp += len(matches)
            fp += len(pred_spans) - len(matches)
            fn += len(true_spans) - len(matches)

        # Berechne Metriken
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        return f1, precision, recall, tp, fp, fn

    def _find_exact_matches(self, true_spans, pred_spans):
        """Finde exakte Matches zwischen True und Predicted Spans"""
        matches = []
        used_pred = set()

        for true_span in true_spans:
            for i, pred_span in enumerate(pred_spans):
                if i not in used_pred and true_span == pred_span:
                    matches.append((true_span, pred_span))
                    used_pred.add(i)
                    break

        return matches

    def _predictions_to_spans(self, predicted_labels, offset_mapping, text):
        """Konvertiere Token-Vorhersagen zu Spans"""
        spans = []
        current_span = None

        for i, label_id in enumerate(predicted_labels):
            if i >= len(offset_mapping):
                break

            label = self.id2label[label_id]
            token_start, token_end = offset_mapping[i]

            if token_start is None:
                continue

            if label.startswith('B-'):
                if current_span:
                    spans.append(current_span)
                current_span = {
                    'type': label[2:],
                    'start': token_start,
                    'end': token_end,
                    'text': text[token_start:token_end]
                }
            elif label.startswith('I-') and current_span:
                current_span['end'] = token_end
                current_span['text'] = text[current_span['start']:current_span['end']]
            else:
                if current_span:
                    spans.append(current_span)
                    current_span = None

        if current_span:
            spans.append(current_span)

        return spans

    def predict(self, texts):
        """Vorhersage für neue Texte"""
        if not hasattr(self, 'model'):
            raise ValueError("Modell muss erst trainiert werden!")

        predictions = []
        device = next(self.model.parameters()).device

        for text in texts:
            # Tokenisierung
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
                                    max_length=512, return_offsets_mapping=True)

            offset_mapping = inputs.pop('offset_mapping')
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Vorhersage
            with torch.no_grad():
                outputs = self.model(**inputs)

            predicted_labels = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()

            # Spans extrahieren
            spans = self._predictions_to_spans(predicted_labels, offset_mapping[0], text)
            predictions.append({'text': text, 'spans': spans})

        return predictions

    def evaluate_strict_f1(self, comments_df, spans_df):
        """Evaluiere Strict F1 auf Test-Daten"""
        if not hasattr(self, 'model'):
            raise ValueError("Modell muss erst trainiert werden!")

        print("Evaluiere Strict F1...")

        # Vorhersagen für alle Kommentare
        texts = comments_df['comment'].tolist()
        predictions = self.predict(texts)

        # Organisiere True Spans
        spans_grouped = spans_df.groupby(['document', 'comment_id'])
        true_spans_dict = {}
        pred_spans_dict = {}

        for i, (_, row) in enumerate(comments_df.iterrows()):
            key = (row['document'], row['comment_id'])

            # True spans
            if key in spans_grouped.groups:
                true_spans = [(span_type, int(start), int(end))
                              for span_type, start, end in
                              spans_grouped.get_group(key)[['type', 'start', 'end']].values]
            else:
                true_spans = []

            # Predicted spans
            pred_spans = [(span['type'], span['start'], span['end'])
                          for span in predictions[i]['spans']]

            true_spans_dict[key] = true_spans
            pred_spans_dict[key] = pred_spans

        # Berechne Strict F1
        all_true_spans = list(true_spans_dict.values())
        all_pred_spans = list(pred_spans_dict.values())

        f1, precision, recall, tp, fp, fn = self._calculate_strict_f1(all_true_spans, all_pred_spans)

        print(f"\nStrict F1 Ergebnisse:")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1-Score:  {f1:.4f}")
        print(f"True Positives: {tp}, False Positives: {fp}, False Negatives: {fn}")

        return {
            'strict_f1': f1,
            'strict_precision': precision,
            'strict_recall': recall,
            'true_positives': tp,
            'false_positives': fp,
            'false_negatives': fn
        }

def convert_spans(row):
    spans = row['predicted_spans']
    document = row['document']
    comment_id = row['comment_id']
    return [{'document': document, 'comment_id': comment_id, 'type': span['type'], 'start': span['start'], 'end': span['end']} for span in spans]

def pred_to_spans(row):
    predicted_labels, offset_mapping, text = row['predicted_labels'], row['offset_mapping'], row['comment']
    return [classifier._predictions_to_spans(predicted_labels, offset_mapping, text)]

In [3]:
comments: pd.DataFrame = pd.read_csv("./share-GermEval2025-data/Data/training data/comments.csv")
task1: pd.DataFrame = pd.read_csv("./share-GermEval2025-data/Data/training data/task1.csv")
task2: pd.DataFrame = pd.read_csv("./share-GermEval2025-data/Data/training data/task2.csv")
comments = comments.merge(task1, on=["document", "comment_id"])
spans_grouped = task2.groupby(['document', 'comment_id'])

test_data: pd.DataFrame = pd.read_csv("./share-GermEval2025-data/Data/test data/comments.csv")

# check every comment that contain spans if they have overlapping start and end positions
task2['overlap'] = False
overlapping_spans = task2.groupby(['document', 'comment_id'])
for (doc, comment), group in overlapping_spans:
    if len(group) > 1:
        starts = group['start'].tolist()
        ends = group['end'].tolist()
        for i in range(len(starts)):
            for j in range(i + 1, len(starts)):
                if not (ends[i] <= starts[j] or ends[j] <= starts[i]):
                    task2.loc[(task2['document'] == doc) & (task2['comment_id'] == comment), 'overlap'] = True

task2 = task2[task2['overlap'] == False].drop(columns=['overlap'])

In [4]:
from multiset import *
ALL_LABELS = ["affection declaration","agreement","ambiguous",
              "compliment","encouragement","gratitude","group membership",
              "implicit","positive feedback","sympathy"]

def fine_grained_flausch_by_label(gold, predicted):
    gold['cid']= gold['document']+"_"+gold['comment_id'].apply(str)
    predicted['cid']= predicted['document']+"_"+predicted['comment_id'].apply(str)

    # annotation sets (predicted)
    pred_spans = Multiset()
    pred_spans_loose = Multiset()
    pred_types = Multiset()

    # annotation sets (gold)
    gold_spans = Multiset()
    gold_spans_loose = Multiset()
    gold_types = Multiset()

    for row in predicted.itertuples(index=False):
        pred_spans.add((row.cid,row.type,row.start,row.end))
        pred_spans_loose.add((row.cid,row.start,row.end))
        pred_types.add((row.cid,row.type))
    for row in gold.itertuples(index=False):
        gold_spans.add((row.cid,row.type,row.start,row.end))
        gold_spans_loose.add((row.cid,row.start,row.end))
        gold_types.add((row.cid,row.type))

    # precision = true_pos / true_pos + false_pos
    # recall = true_pos / true_pos + false_neg
    # f_1 = 2 * prec * rec / (prec + rec)

    results = {'TOTAL': {'STRICT': {},'SPANS': {},'TYPES': {}}}
    # label-wise evaluation (only for strict and type)
    for label in ALL_LABELS:
        results[label] = {'STRICT': {},'TYPES': {}}
        gold_spans_x = set(filter(lambda x: x[1].__eq__(label), gold_spans))
        pred_spans_x = set(filter(lambda x: x[1].__eq__(label), pred_spans))
        gold_types_x = set(filter(lambda x: x[1].__eq__(label), gold_types))
        pred_types_x = set(filter(lambda x: x[1].__eq__(label), pred_types))

        # strict: spans + type must match
        ### NOTE: x and y / x returns 0 if x = 0 and y/x otherwise (test for zero division)
        strict_p = float(len(pred_spans_x)) and float( len(gold_spans_x.intersection(pred_spans_x))) / len(pred_spans_x)
        strict_r = float(len(gold_spans_x)) and float( len(gold_spans_x.intersection(pred_spans_x))) / len(gold_spans_x)
        strict_f = (strict_p + strict_r) and 2 * strict_p * strict_r / (strict_p + strict_r)
        results[label]['STRICT']['prec'] = strict_p
        results[label]['STRICT']['rec'] = strict_r
        results[label]['STRICT']['f1'] = strict_f

        # detection mode: only types must match (per post)
        types_p = float(len(pred_types_x)) and float( len(gold_types_x.intersection(pred_types_x))) / len(pred_types_x)
        types_r = float(len(gold_types_x)) and float( len(gold_types_x.intersection(pred_types_x))) / len(gold_types_x)
        types_f = (types_p + types_r) and 2 * types_p * types_r / (types_p + types_r)
        results[label]['TYPES']['prec'] = types_p
        results[label]['TYPES']['rec'] = types_r
        results[label]['TYPES']['f1'] = types_f

    # Overall evaluation
    # strict: spans + type must match
    strict_p = float(len(pred_spans)) and float( len(gold_spans.intersection(pred_spans))) / len(pred_spans)
    strict_r = float(len(gold_spans)) and float( len(gold_spans.intersection(pred_spans))) / len(gold_spans)
    strict_f = (strict_p + strict_r) and 2 * strict_p * strict_r / (strict_p + strict_r)
    results['TOTAL']['STRICT']['prec'] = strict_p
    results['TOTAL']['STRICT']['rec'] = strict_r
    results['TOTAL']['STRICT']['f1'] = strict_f

    # spans: spans must match
    spans_p = float(len(pred_spans_loose)) and float( len(gold_spans_loose.intersection(pred_spans_loose))) / len(pred_spans_loose)
    spans_r = float(len(gold_spans_loose)) and float( len(gold_spans_loose.intersection(pred_spans_loose))) / len(gold_spans_loose)
    spans_f = (spans_p + spans_r) and 2 * spans_p * spans_r / (spans_p + spans_r)
    results['TOTAL']['SPANS']['prec'] = spans_p
    results['TOTAL']['SPANS']['rec'] = spans_r
    results['TOTAL']['SPANS']['f1'] = spans_f

    # detection mode: only types must match (per post)
    types_p = float(len(pred_types)) and float( len(gold_types.intersection(pred_types))) / len(pred_types)
    types_r = float(len(gold_types)) and float( len(gold_types.intersection(pred_types))) / len(gold_types)
    types_f = (types_p + types_r) and 2 * types_p * types_r / (types_p + types_r)
    results['TOTAL']['TYPES']['prec'] = types_p
    results['TOTAL']['TYPES']['rec'] = types_r
    results['TOTAL']['TYPES']['f1'] = types_f

#    print("STRICT:\n ",strict_p,strict_r,strict_f)
#    print("SPANS:\n ",spans_p,spans_r,spans_f)
#    print("TYPES:\n ",types_p,types_r,types_f)
    return(results)

In [5]:
classifier = SpanClassifierWithStrictF1('deepset/gbert-large')

In [6]:
# Dataset neu erstellen für diesen Fold
examples, eval_data = classifier.create_dataset(comments, task2)
train_examples, val_examples = train_test_split(examples, test_size=0.1, random_state=42)

# Evaluation-Daten entsprechend aufteilen
train_indices, val_indices = train_test_split(range(len(examples)), test_size=0.1, random_state=42)

In [8]:
classifier.model = BertForTokenClassification.from_pretrained(
    'deepset/gbert-large',
    num_labels=len(classifier.labels),
    id2label=classifier.id2label,
    label2id=classifier.label2id
)
classifier.model.load_state_dict(torch.load('./experiments/exp020/exp020-4-final3_model.pth'))
classifier.model.eval()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31102, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024

In [9]:
len(comments.iloc[val_indices].comment.tolist())

3706

In [10]:
comments_val = comments.iloc[val_indices].copy()
comments_val.reset_index(drop=True, inplace=True)

In [11]:
test_comments = comments_val.copy()

comments_val['gold_spans'] = None
comments_val['predicted_labels'] = None
comments_val['predicted_probs'] = None
comments_val['offset_mapping'] = None
comments_val['text_tokens'] = None

for idx in range(len(comments_val)): #range(15):
    row = comments_val.iloc[idx]
    text = row['comment']
    key = (row['document'], row['comment_id'])

    text_tokens = classifier.tokenizer.tokenize(text)
    comments_val.at[idx, 'text_tokens'] = text_tokens

    device = next(classifier.model.parameters()).device
    inputs = classifier.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, return_offsets_mapping=True)

    offset_mapping = inputs.pop('offset_mapping')
    comments_val.at[idx, 'offset_mapping'] = offset_mapping.cpu().numpy()[0].tolist()
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Vorhersage
    with torch.no_grad():
        outputs = classifier.model(**inputs)

    predicted_labels = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()
    predicted_probs = torch.nn.functional.softmax(outputs.logits, dim=2)[0].cpu().numpy()
    comments_val.at[idx, 'predicted_labels'] = predicted_labels
    comments_val.at[idx, 'predicted_probs'] = predicted_probs

    if key not in spans_grouped.groups:
        comments_val.at[idx, 'gold_spans'] = []
        pass
    else:
        spans = spans_grouped.get_group(key).to_dict(orient='records')
        comments_val.at[idx, 'gold_spans'] = spans

# or simply predict like this witout probabilities:
# val_set_predictions = classifier.predict(comments_val.comment.tolist())

In [12]:
comments_val['predicted_spans'] = comments_val.apply(pred_to_spans, axis=1, result_type='expand')

test_gold_spans = pd.DataFrame((comments_val['gold_spans'].explode().dropna().tolist()))
test_baseline_spans = pd.DataFrame(comments_val.apply(convert_spans, axis=1).explode().dropna().tolist())
print(f"F1 on ES data before postprocessing {fine_grained_flausch_by_label(test_gold_spans, test_baseline_spans)['TOTAL']['STRICT']}")

F1 on ES data before postprocessing {'prec': 0.7572335920959774, 'rec': 0.7073170731707317, 'f1': 0.7314246762099522}


In [13]:
def build_spans_from_classification(tokens, classification, offset_mapping):
    """Modified version to work with string tokens and offset mapping"""
    res = []
    searching_end = False
    temp_res = []
    trunc_count = 0
    skip_count = 0

    for i, el in enumerate(classification):
        # Skip special tokens like CLS, SEP
        if i >= len(offset_mapping) or offset_mapping[i][0] is None:
            continue

        token_start, token_end = offset_mapping[i]

        if el == 'O' and searching_end is True:
            if i != 0 and i != len(tokens) - 1 and tokens[i].startswith('##'):
                continue
            if i > 0:
                prev_end = offset_mapping[i-1][1]
                temp_res[1] = prev_end
            else:
                temp_res[1] = -1
            res.append(temp_res)
            searching_end = False

        elif el.startswith('B-'):
            if i != 0 and i != len(tokens) - 1 and tokens[i].startswith('##'):
                continue
            if searching_end is True:
                if i > 0:
                    prev_end = offset_mapping[i-1][1]
                    temp_res[1] = prev_end
                else:
                    temp_res[1] = -1
                res.append(temp_res)
                trunc_count += 1
            split = el.split('-', 1)
            label_type = split[1]
            temp_res = [token_start, -1, label_type, ""]  # Changed structure
            searching_end = True

        elif el.startswith('I-'):
            if searching_end is True and i != 0 and i != len(tokens) - 1 and tokens[i].startswith('##'):
                continue
            split = el.split('-', 1)
            label_type = split[1]
            if searching_end is True and label_type != temp_res[2]:
                if i > 0:
                    prev_end = offset_mapping[i-1][1]
                    temp_res[1] = prev_end
                else:
                    temp_res[1] = -1
                res.append(temp_res)
                searching_end = False
                trunc_count += 1
            elif searching_end is False:
                skip_count += 1

    if searching_end is True and len(offset_mapping) > 1:
        temp_res[1] = offset_mapping[-1][1]
        res.append(temp_res)

    return res, skip_count, trunc_count

def apply_span_classification(row):
    """Apply classification to the tokens and return spans."""
    tokens = ['[CLS]', *row['text_tokens'], '[SEP]']
    classification = row['predicted_labels']
    offset_mapping = row['offset_mapping']

    # Convert classification to BIO format
    bio_labels = [classifier.id2label[label] for label in classification]

    print(len(tokens), len(bio_labels), len(offset_mapping))
    spans, skip_count, trunc_count = build_spans_from_classification(tokens, bio_labels, offset_mapping)

    document = row['document']
    comment_id = row['comment_id']

    return [{'document': document, 'comment_id': comment_id, 'type': span[2], 'start': span[0], 'end': span[1]}
            for span in spans if span[0] != -1 and span[1] != -1]

ge2017_rules_test_pred_spans = pd.DataFrame(comments_val.apply(apply_span_classification, axis=1).explode().dropna().tolist())

8 8 8
17 17 17
12 12 12
51 51 51
32 32 32
26 26 26
11 11 11
57 57 57
18 18 18
61 61 61
93 93 93
3 3 3
4 4 4
12 12 12
17 17 17
14 14 14
5 5 5
15 15 15
54 54 54
3 3 3
8 8 8
26 26 26
8 8 8
8 8 8
25 25 25
8 8 8
14 14 14
25 25 25
4 4 4
47 47 47
16 16 16
7 7 7
10 10 10
16 16 16
23 23 23
4 4 4
46 46 46
4 4 4
16 16 16
8 8 8
11 11 11
6 6 6
7 7 7
21 21 21
13 13 13
51 51 51
20 20 20
35 35 35
20 20 20
15 15 15
16 16 16
13 13 13
28 28 28
11 11 11
12 12 12
87 87 87
7 7 7
8 8 8
26 26 26
7 7 7
40 40 40
6 6 6
58 58 58
11 11 11
4 4 4
6 6 6
30 30 30
19 19 19
54 54 54
36 36 36
6 6 6
12 12 12
24 24 24
4 4 4
11 11 11
7 7 7
9 9 9
17 17 17
33 33 33
22 22 22
9 9 9
23 23 23
17 17 17
9 9 9
6 6 6
14 14 14
137 137 137
10 10 10
6 6 6
10 10 10
6 6 6
8 8 8
9 9 9
19 19 19
9 9 9
4 4 4
31 31 31
10 10 10
8 8 8
6 6 6
24 24 24
4 4 4
11 11 11
4 4 4
34 34 34
9 9 9
21 21 21
5 5 5
10 10 10
16 16 16
8 8 8
8 8 8
50 50 50
18 18 18
5 5 5
7 7 7
4 4 4
5 5 5
8 8 8
9 9 9
19 19 19
23 23 23
6 6 6
3 3 3
22 22 22
15 15 15
10 10 10
5 5 5
4

In [14]:
print(f"F1 on ES data before postprocessing      {fine_grained_flausch_by_label(test_gold_spans, test_baseline_spans)['TOTAL']['STRICT']}")
print(f"F1 on ES data with GE2017 postprocessing {fine_grained_flausch_by_label(test_gold_spans, ge2017_rules_test_pred_spans)['TOTAL']['STRICT']}")

F1 on ES data before postprocessing      {'prec': 0.7572335920959774, 'rec': 0.7073170731707317, 'f1': 0.7314246762099522}
F1 on ES data with GE2017 postprocessing {'prec': 0.7649964714184898, 'rec': 0.7145682267633487, 'f1': 0.7389229720518065}


In [15]:
test_comments = test_data

test_comments['predicted_labels'] = None
test_comments['predicted_probs'] = None
test_comments['offset_mapping'] = None
test_comments['text_tokens'] = None

for idx in range(len(test_comments)): #range(15):
    row = test_comments.iloc[idx]
    text = row['comment']
    key = (row['document'], row['comment_id'])

    text_tokens = classifier.tokenizer.tokenize(text)
    test_comments.at[idx, 'text_tokens'] = text_tokens

    device = next(classifier.model.parameters()).device
    inputs = classifier.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, return_offsets_mapping=True)

    offset_mapping = inputs.pop('offset_mapping')
    test_comments.at[idx, 'offset_mapping'] = offset_mapping.cpu().numpy()[0].tolist()
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Vorhersage
    with torch.no_grad():
        outputs = classifier.model(**inputs)

    predicted_labels = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()
    predicted_probs = torch.nn.functional.softmax(outputs.logits, dim=2)[0].cpu().numpy()
    test_comments.at[idx, 'predicted_labels'] = predicted_labels
    test_comments.at[idx, 'predicted_probs'] = predicted_probs

In [16]:
test_comments

Unnamed: 0,document,comment_id,comment,predicted_labels,predicted_probs,offset_mapping,text_tokens
0,NDY-004,1,Lol i love lochis,"[0, 3, 13, 13, 13, 13, 13, 13, 0]","[[0.9998258, 1.1969776e-05, 1.1567051e-06, 7.7...","[[0, 0], [0, 2], [2, 3], [4, 5], [6, 10], [11,...","[Lo, ##l, i, love, loc, ##hi, ##s]"
1,NDY-004,2,ihr singt voll gut :),"[0, 2, 12, 12, 12, 12, 12, 0]","[[0.99976057, 6.098534e-05, 4.611753e-06, 3.97...","[[0, 0], [0, 3], [4, 9], [10, 14], [15, 18], [...","[ihr, singt, voll, gut, :, )]"
2,NDY-004,3,Junge fick dich,"[0, 0, 0, 0, 0, 0]","[[0.9999795, 1.4359919e-06, 3.3691788e-07, 6.3...","[[0, 0], [0, 5], [6, 7], [7, 10], [11, 15], [0...","[Junge, f, ##ick, dich]"
3,NDY-004,4,Ihr seit die besten,"[0, 3, 13, 13, 13, 0]","[[0.9998192, 4.859976e-05, 2.3917532e-06, 6.91...","[[0, 0], [0, 3], [4, 8], [9, 12], [13, 19], [0...","[Ihr, seit, die, besten]"
4,NDY-004,5,ihr seit die ALLER besten ich finde euch soooo...,"[0, 3, 13, 13, 13, 13, 13, 3, 13, 13, 13, 13, ...","[[0.99971765, 4.202885e-05, 1.5942048e-06, 9.6...","[[0, 0], [0, 3], [4, 8], [9, 12], [13, 16], [1...","[ihr, seit, die, ALL, ##ER, besten, ich, finde..."
...,...,...,...,...,...,...,...
9224,NDY-203,522,hihi kannst du mich grüßen 💕 👋 😍 Achso wusstes...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 11, 0, 0, 0, 0,...","[[0.99931705, 3.137185e-05, 1.2843102e-06, 8.7...","[[0, 0], [0, 2], [2, 4], [5, 11], [12, 14], [1...","[hi, ##hi, kannst, du, mich, grü, ##ßen, [UNK]..."
9225,NDY-203,523,#Glocke aktiviert 👑 Ich liebe deine Videos 💍 💎...,"[0, 0, 0, 0, 0, 1, 11, 11, 11, 11, 11, 0, 0, 0...","[[0.9997228, 3.5343048e-05, 8.4241697e-07, 4.5...","[[0, 0], [0, 1], [1, 7], [8, 17], [18, 19], [2...","[#, Glocke, aktiviert, [UNK], Ich, liebe, dein..."
9226,NDY-203,524,Bist die beste ❤ Bitte Grüße mich 💕 ❤ 😘 😍,"[0, 3, 13, 13, 13, 0, 0, 0, 0, 1, 11, 11, 11, 0]","[[0.99983156, 3.9064038e-05, 1.023527e-06, 4.7...","[[0, 0], [0, 4], [5, 8], [9, 14], [15, 16], [1...","[Bist, die, beste, [UNK], Bitte, Grü, ##ße, mi..."
9227,NDY-203,525,"Hi Bonny ❤️ War letztens auf'm Flughafen , und...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.9999653, 2.7837193e-06, 2.9389835e-07, 4.8...","[[0, 0], [0, 2], [3, 7], [7, 8], [9, 11], [12,...","[Hi, Bonn, ##y, [UNK], War, letzten, ##s, auf,..."


In [17]:
test_comments_postprocessed = pd.DataFrame(test_comments.apply(apply_span_classification, axis=1).explode().dropna().tolist())

9 9 9
8 8 8
6 6 6
6 6 6
28 28 28
8 8 8
48 48 48
18 18 18
18 18 18
11 11 11
7 7 7
30 30 30
7 7 7
22 22 22
6 6 6
12 12 12
51 51 51
6 6 6
8 8 8
11 11 11
6 6 6
13 13 13
6 6 6
7 7 7
23 23 23
8 8 8
18 18 18
9 9 9
15 15 15
6 6 6
6 6 6
10 10 10
17 17 17
5 5 5
10 10 10
9 9 9
9 9 9
4 4 4
26 26 26
27 27 27
11 11 11
20 20 20
17 17 17
13 13 13
16 16 16
7 7 7
4 4 4
13 13 13
14 14 14
22 22 22
31 31 31
6 6 6
9 9 9
11 11 11
4 4 4
4 4 4
4 4 4
21 21 21
9 9 9
4 4 4
4 4 4
3 3 3
7 7 7
11 11 11
16 16 16
14 14 14
34 34 34
16 16 16
11 11 11
9 9 9
22 22 22
8 8 8
28 28 28
47 47 47
4 4 4
6 6 6
5 5 5
5 5 5
18 18 18
6 6 6
7 7 7
26 26 26
28 28 28
3 3 3
4 4 4
4 4 4
14 14 14
6 6 6
92 92 92
22 22 22
7 7 7
15 15 15
4 4 4
29 29 29
14 14 14
20 20 20
10 10 10
36 36 36
20 20 20
11 11 11
33 33 33
8 8 8
12 12 12
13 13 13
7 7 7
19 19 19
18 18 18
8 8 8
15 15 15
4 4 4
9 9 9
7 7 7
8 8 8
6 6 6
8 8 8
6 6 6
18 18 18
14 14 14
8 8 8
15 15 15
7 7 7
27 27 27
11 11 11
13 13 13
7 7 7
28 28 28
4 4 4
14 14 14
13 13 13
3 3 3
11 11 11
8 8 8
7

In [18]:
test_comments_postprocessed

Unnamed: 0,document,comment_id,type,start,end
0,NDY-004,1,affection declaration,0,17
1,NDY-004,2,compliment,0,21
2,NDY-004,4,affection declaration,0,19
3,NDY-004,5,affection declaration,0,25
4,NDY-004,5,affection declaration,26,56
...,...,...,...,...,...
4954,NDY-203,524,positive feedback,34,41
4955,NDY-203,525,positive feedback,229,237
4956,NDY-203,526,affection declaration,0,17
4957,NDY-203,526,positive feedback,30,63


In [19]:
test_comments_postprocessed.to_csv("./submissions/task2-predicted.csv", index=False)

In [20]:
!head -n 5 ./submissions/task2-predicted.csv

document,comment_id,type,start,end
NDY-004,1,affection declaration,0,17
NDY-004,2,compliment,0,21
NDY-004,4,affection declaration,0,19
NDY-004,5,affection declaration,0,25


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
!cp './submissions/task2-predicted.csv' './submissions/subtask2_submission1.csv'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
