# Working with a Large Language Model (LLM)

## Prerequisites

In [1]:
import math
import json
import pickle
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
from tabulate import tabulate
from transformers import pipeline

# Get candidate labels
with open("packing_label_structure.json", "r") as file:
    candidate_labels = json.load(file)
keys_list = list(candidate_labels.keys())

# Load test data (list of dictionaries)
with open("test_data.json", "r") as file:
    packing_data = json.load(file)
# Extract trip descriptions and classification (trip_types)
trip_descriptions = [trip['description'] for trip in packing_data]
trip_types = [trip['trip_types'] for trip in packing_data]

**All trip descriptions**

In [2]:
for i, item in enumerate(trip_descriptions):
    print(i, ".", item, "\n")
    for elem in trip_types[i]:
        print(elem)
    print("\n")

0 . I am planning a trip to Greece with my boyfriend, where we will visit two islands. We have booked an apartment on each island for a few days and plan to spend most of our time relaxing. Our main goals are to enjoy the beach, try delicious local food, and possibly go on a hike—if it’s not too hot. We will be relying solely on public transport. We’re in our late 20s and traveling from the Netherlands. 

beach vacation
['swimming', 'going to the beach', 'relaxing', 'hiking']
warm destination / summer
lightweight (but comfortable)
casual
indoor
no own vehicle
no special conditions to consider
7+ days


1 . We are a couple in our thirties traveling to Vienna for a three-day city trip. We’ll be staying at a friend’s house and plan to explore the city by sightseeing, strolling through the streets, visiting markets, and trying out great restaurants and cafés. We also hope to attend a classical music concert. Our journey to Vienna will be by train. 

city trip
['sightseeing']
variable weath

**Functions**

In [3]:
def pred_trip(model_name, trip_descr, trip_type, cut_off = 0.5):
    """
    Classifies trip
    
    Parameters:
    model_name: name of hugging-face model
    trip_descr: text describing the trip
    trip_type: true trip classification
    cut_off: cut_off for choosing activities

    Returns:
    pd Dataframe: with class predictions and true values
    """
    
    classifier = pipeline("zero-shot-classification", model=model_name)
    df = pd.DataFrame(columns=['superclass', 'pred_class'])
    for i, key in enumerate(keys_list):
        print(i)
        if key == 'activities':
            result = classifier(trip_descr, candidate_labels[key], multi_label=True)
            indices = [i for i, score in enumerate(result['scores']) if score > cut_off]
            classes = [result['labels'][i] for i in indices]
        else:
            result = classifier(trip_descr, candidate_labels[key])
            classes = result["labels"][0]
        df.loc[i] = [key, classes]
    df['true_class'] = trip_type
    return df

In [4]:
def perf_measure(df):
    """
    Calculates performance measures:
    Accuracy of classification excluding activities superclass
    Percentage of correctly identified activities (#correctly predicted/#true activities)
    Percentage of wrongly identified activities (#wrongly predicted/#predicted activities)

    Parameters:
    df: pd Dataframe returned from pred_trip()

    Returns:
    pd Dataframe: containing performance measures
    """
    
    df['same_value'] = df['pred_class'] == df['true_class']
    correct = sum(df.loc[df.index != 1, 'same_value'])
    total = len(df['same_value'])
    accuracy = correct/total
    pred_class = df.loc[df.index == 1, 'pred_class'].iloc[0]
    true_class = df.loc[df.index == 1, 'true_class'].iloc[0]
    correct = [label for label in pred_class if label in true_class]
    num_correct = len(correct)
    correct_perc = num_correct/len(true_class)
    num_pred = len(pred_class)
    if num_pred == 0:
        wrong_perc = math.nan
    else:
        wrong_perc = (num_pred - num_correct)/num_pred
    df_perf = pd.DataFrame({
    'accuracy': [accuracy],
    'true_ident': [correct_perc],
    'false_pred': [wrong_perc]
    })
    return(df_perf)

## Make predictions for many models and trip descriptions

Provide a list of candidate models and apply them to the test data.

In [22]:
# List of Hugging Face model names
# trending...
"""
model_names = [
    "facebook/bart-large-mnli",
    "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
    "cross-encoder/nli-deberta-v3-base",
    "cross-encoder/nli-deberta-v3-large",
    "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
    "joeddav/bart-large-mnli-yahoo-answers",
    "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli",
    "MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
    "valhalla/distilbart-mnli-12-1",
    #"joeddav/xlm-roberta-large-xnli" # keeps giving errors
]
"""

# most downloads
model_names = [
    #"facebook/bart-large-mnli",
    #"MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
    #"sileod/deberta-v3-base-tasksource-nli",
    #"vicgalle/xlm-roberta-large-xnli-anli", # gives errors
    #"joeddav/xlm-roberta-large-xnli",# errors
    #"chuhac/BiomedCLIP-vit-bert-hf",# errors
    "pongjin/roberta_with_kornli",
    #"joeddav/bart-large-mnli-yahoo-answers",
    #"MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
    #"valhalla/distilbart-mnli-12-1",
    "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
]


# Apply each model to the test data
for model_name in model_names:
    print(f"\nUsing model: {model_name}")
    result_list = []
    performance = pd.DataFrame(columns=['accuracy', 'true_ident', 'false_pred'])
    
    start_time = time.time()
    for i in range(len(trip_descriptions)):
        current_trip = trip_descriptions[i]
        current_type = trip_types[i]
        df = pred_trip(model_name, current_trip, current_type, cut_off = 0.5)
        performance = pd.concat([performance, perf_measure(df)])
        result_list.append(df)
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Extract and combine columns identifying correct prediction (for each trip)
    sv_columns = [df['same_value'] for df in result_list]
    sv_columns.insert(0, result_list[0]['superclass'])
    sv_df = pd.concat(sv_columns, axis=1)
    # Compute accuracy per superclass
    row_means = sv_df.iloc[:, 1:].mean(axis=1)
    df_row_means = pd.DataFrame({
        'superclass': sv_df['superclass'],
        'accuracy': row_means
    })
    # Compute performance measures per trip (mean for each column of performance table)
    column_means = performance.mean()
    # Save results
    model = model_name.replace("/", "-")
    model_result = {
        'model': model,
        'predictions': result_list,
        'performance': performance,
        'perf_summary': column_means,
        'perf_superclass': df_row_means,
        'elapsed_time': elapsed_time
    }
    filename = os.path.join('results', f'{model}_results.pkl')
    with open(filename, 'wb') as f:
        pickle.dump(model_result, f)






Using model: pongjin/roberta_with_kornli


config.json:   0%|          | 0.00/985 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/415 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8

Using model: MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/6303b5f71dd5d3c62482f3e9/11250a388e769f8d819bc3d9b55590ab43bde78b2d4f37df57bc5603c5dea0b0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251104%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251104T105145Z&X-Amz-Expires=3600&X-Amz-Signature=603bdf8dd3ba17470e9f30bd3b31141b09a02d57e6cf00b63996b798d985c323&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1762257105&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc2MjI1NzEwNX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MzAzYjVmNzFkZDVkM2M2MjQ4MmYzZTkvMTEyNTBhMzg4ZTc2OWY4ZDgxOWJjM2Q5YjU1NTkwYWI0M2JkZTc4YjJkNGYzN2RmNTdiYzU2MDNjNWRlYTBiMCoifV19&Signature=s7MbLznU9vhypkS2WueXkH%7E41QifR0qWT8YHXKdMNPheh483kMt

model.safetensors:  26%|##6       | 147M/558M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/6303b5f71dd5d3c62482f3e9/11250a388e769f8d819bc3d9b55590ab43bde78b2d4f37df57bc5603c5dea0b0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251104%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251104T105145Z&X-Amz-Expires=3600&X-Amz-Signature=603bdf8dd3ba17470e9f30bd3b31141b09a02d57e6cf00b63996b798d985c323&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1762257105&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc2MjI1NzEwNX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MzAzYjVmNzFkZDVkM2M2MjQ4MmYzZTkvMTEyNTBhMzg4ZTc2OWY4ZDgxOWJjM2Q5YjU1NTkwYWI0M2JkZTc4YjJkNGYzN2RmNTdiYzU2MDNjNWRlYTBiMCoifV19&Signature=s7MbLznU9vhypkS2WueXkH%7E41QifR0qWT8YHXKdMNPheh483kMt

model.safetensors:  26%|##6       | 147M/558M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/6303b5f71dd5d3c62482f3e9/11250a388e769f8d819bc3d9b55590ab43bde78b2d4f37df57bc5603c5dea0b0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251104%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251104T105145Z&X-Amz-Expires=3600&X-Amz-Signature=603bdf8dd3ba17470e9f30bd3b31141b09a02d57e6cf00b63996b798d985c323&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1762257105&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc2MjI1NzEwNX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MzAzYjVmNzFkZDVkM2M2MjQ4MmYzZTkvMTEyNTBhMzg4ZTc2OWY4ZDgxOWJjM2Q5YjU1NTkwYWI0M2JkZTc4YjJkNGYzN2RmNTdiYzU2MDNjNWRlYTBiMCoifV19&Signature=s7MbLznU9vhypkS2WueXkH%7E41QifR0qWT8YHXKdMNPheh483kMt

model.safetensors:  26%|##6       | 147M/558M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/6303b5f71dd5d3c62482f3e9/11250a388e769f8d819bc3d9b55590ab43bde78b2d4f37df57bc5603c5dea0b0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251104%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251104T105145Z&X-Amz-Expires=3600&X-Amz-Signature=603bdf8dd3ba17470e9f30bd3b31141b09a02d57e6cf00b63996b798d985c323&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1762257105&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc2MjI1NzEwNX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MzAzYjVmNzFkZDVkM2M2MjQ4MmYzZTkvMTEyNTBhMzg4ZTc2OWY4ZDgxOWJjM2Q5YjU1NTkwYWI0M2JkZTc4YjJkNGYzN2RmNTdiYzU2MDNjNWRlYTBiMCoifV19&Signature=s7MbLznU9vhypkS2WueXkH%7E41QifR0qWT8YHXKdMNPheh483kMt

model.safetensors:  26%|##6       | 147M/558M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


0
1
2
3
4
5
6
7
8


## Load and compare results

In [5]:
# Folder where .pkl files are saved
results_dir = 'results'

# Dictionary to store all loaded results
all_results = {}

# Loop through all .pkl files in the folder
for filename in os.listdir(results_dir):
    if filename.endswith('.pkl'):
        model_name = filename.replace('_results.pkl', '')  # Extract model name
        file_path = os.path.join(results_dir, filename)
        
        # Load the result
        with open(file_path, 'rb') as f:
            result = pickle.load(f)
            all_results[model_name] = result

# Compare performance across models
for model, data in all_results.items():
    print(f"Model: {model}")
    print(f"Performance Summary:\n{data['perf_summary']}")
    print("-" * 40)


Model: cross-encoder-nli-deberta-v3-base
Performance Summary:
accuracy      0.444444
true_ident    0.533333
false_pred    0.712500
dtype: float64
----------------------------------------
Model: joeddav-bart-large-mnli-yahoo-answers
Performance Summary:
accuracy      0.355556
true_ident    0.650000
false_pred    0.553792
dtype: float64
----------------------------------------
Model: cross-encoder-nli-deberta-v3-large
Performance Summary:
accuracy      0.466667
true_ident    0.566667
false_pred    0.541667
dtype: float64
----------------------------------------
Model: MoritzLaurer-DeBERTa-v3-large-mnli-fever-anli-ling-wanli
Performance Summary:
accuracy      0.611111
true_ident    0.841667
false_pred    0.546667
dtype: float64
----------------------------------------
Model: MoritzLaurer-mDeBERTa-v3-base-mnli-xnli
Performance Summary:
accuracy      0.455556
true_ident    0.408333
false_pred    0.481250
dtype: float64
----------------------------------------
Model: MoritzLaurer-deberta-v3-

In [6]:
print(len(all_results))

12


**Identify trips that are difficult to predict**

Per model

In [7]:
def get_difficult_trips(model_result, cut_off = 0.6):
    """
    """
    # model_result is a dict with dict_keys(['model', 'predictions', 
    # 'performance', 'perf_summary', 'perf_superclass', 'elapsed_time'])
    # get performance dataframe and repair index
    df = model_result['performance'].reset_index(drop=True)
    # find index of trips whose accuracy is below cut_off
    index_result = df[df['accuracy'] < cut_off].index
    return(index_result)

# dictionary of trips that have accuracy below cut_off default
difficult_trips_dict = {}
for model, data in all_results.items():
    difficult_trips_dict[data["model"]] = get_difficult_trips(data)

for key, value in difficult_trips_dict.items():
    print(f"{key}: {value}\n")

cross-encoder-nli-deberta-v3-base: Index([0, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

joeddav-bart-large-mnli-yahoo-answers: RangeIndex(start=0, stop=10, step=1)

cross-encoder-nli-deberta-v3-large: Index([0, 1, 2, 3, 4, 6, 7, 8, 9], dtype='int64')

MoritzLaurer-DeBERTa-v3-large-mnli-fever-anli-ling-wanli: Index([2, 3, 5, 6, 7, 8, 9], dtype='int64')

MoritzLaurer-mDeBERTa-v3-base-mnli-xnli: RangeIndex(start=0, stop=10, step=1)

MoritzLaurer-deberta-v3-large-zeroshot-v2.0: Index([1, 2, 3, 5, 6, 7, 9], dtype='int64')

pongjin-roberta_with_kornli: RangeIndex(start=0, stop=10, step=1)

sileod-deberta-v3-base-tasksource-nli: Index([0, 2, 3, 5, 6], dtype='int64')

MoritzLaurer-mDeBERTa-v3-base-xnli-multilingual-nli-2mil7: Index([0, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

facebook-bart-large-mnli: RangeIndex(start=0, stop=10, step=1)

valhalla-distilbart-mnli-12-1: Index([0, 1, 2, 3, 4, 7, 9], dtype='int64')

MoritzLaurer-DeBERTa-v3-base-mnli-fever-anli: Index([0, 2, 3, 4, 6, 7], dtype='int6

For all models

In [8]:
# Which trips are difficult for all models
common = set.intersection(*(set(v) for v in difficult_trips_dict.values()))
for index in common:
    print(index, ".", trip_descriptions[index], "\n")
    for item in trip_types[index]:
        print(item)
    print("\n")

2 . My partner and I are traveling to the Netherlands and Germany to spend Christmas with our family. We are in our late twenties and will start our journey with a two-hour flight to the Netherlands. From there, we will take a 5.5-hour train ride to northern Germany. 

city trip
['relaxing']
cold destination / winter
lightweight (but comfortable)
casual
indoor
no own vehicle
no special conditions to consider
7+ days


3 . I’m in my twenties and will be traveling to Peru for three weeks. I’m going solo but will meet up with a friend to explore the Sacred Valley and take part in a Machu Picchu tour. We plan to hike, go rafting, and explore the remnants of the ancient Inca Empire. We’re also excited to try Peruvian cuisine and immerse ourselves in the local culture. Depending on our plans, we might also visit the rainforest region, such as Tarapoto. I’ll be flying to Peru on a long-haul flight and will be traveling in August. 

cultural exploration
['sightseeing', 'hiking', 'rafting']
var

**Identify superclasses that are difficult to predict**

Per model

In [9]:
def get_difficult_superclasses(model_result, cut_off = 0.6):
    # model_result is a dict with dict_keys(['model', 'predictions', 
    # 'performance', 'perf_summary', 'perf_superclass', 'elapsed_time'])
    df = model_result["perf_superclass"]
    # find superclass whose accuracy is below cut_off
    diff_spc = list(df[df['accuracy'] < cut_off]["superclass"])
    return(diff_spc)

# make dictionary of superclasses that have accuracy below cut_off default
difficult_superclass_dict = {}
for model, data in all_results.items():
    difficult_superclass_dict[data["model"]] = get_difficult_superclasses(data)

for key, value in difficult_superclass_dict.items():
    print(f"{key}: {value}\n")

cross-encoder-nli-deberta-v3-base: ['activities', 'climate_or_season', 'style_or_comfort', 'special_conditions']

joeddav-bart-large-mnli-yahoo-answers: ['activities', 'climate_or_season', 'style_or_comfort', 'dress_code', 'accommodation', 'transportation', 'special_conditions']

cross-encoder-nli-deberta-v3-large: ['activities', 'climate_or_season', 'style_or_comfort', 'transportation', 'special_conditions']

MoritzLaurer-DeBERTa-v3-large-mnli-fever-anli-ling-wanli: ['activities', 'style_or_comfort']

MoritzLaurer-mDeBERTa-v3-base-mnli-xnli: ['activities', 'style_or_comfort', 'accommodation', 'special_conditions', 'trip_length_days']

MoritzLaurer-deberta-v3-large-zeroshot-v2.0: ['activities', 'climate_or_season', 'style_or_comfort', 'accommodation', 'special_conditions']

pongjin-roberta_with_kornli: ['activity_type', 'activities', 'climate_or_season', 'style_or_comfort', 'dress_code', 'accommodation', 'transportation', 'special_conditions', 'trip_length_days']

sileod-deberta-v3-bas

For all models

In [10]:
# Which trips are difficult for all models
common = set.intersection(*(set(v) for v in difficult_superclass_dict.values()))
print(common)

{'activities', 'style_or_comfort'}


In [11]:
# Look at particular predicitons in detail
# print(all_results["joeddav-bart-large-mnli-yahoo-answers"])

**Comparing models**

In [12]:
pd.set_option('display.max_columns', None)       # show all columns
pd.set_option('display.max_colwidth', None)     # do not truncate cell contents
pd.set_option('display.width', 200)  

perf_table = []
# fill in for loop with perf_summary per model
for model, result in all_results.items():
    row = pd.DataFrame(result["perf_summary"]).T
    # row["model"] = model
    row.insert(0, "model", model)  # insert as first column
    perf_table.append(row)
# Concatenate all into one table
df_all = pd.concat(perf_table, ignore_index=True)
df = df_all.sort_values(by="accuracy", ascending=False).reset_index(drop=True)



print(df)
#print(type(df_all))
    

# rank by accuracy


                                                        model  accuracy  true_ident  false_pred
0    MoritzLaurer-DeBERTa-v3-large-mnli-fever-anli-ling-wanli  0.611111    0.841667    0.546667
1                       sileod-deberta-v3-base-tasksource-nli  0.566667    0.700000    0.551667
2                MoritzLaurer-DeBERTa-v3-base-mnli-fever-anli  0.522222    0.841667    0.572381
3                 MoritzLaurer-deberta-v3-large-zeroshot-v2.0  0.500000    0.325000    0.500000
4                               valhalla-distilbart-mnli-12-1  0.500000    0.300000    0.533333
5   MoritzLaurer-mDeBERTa-v3-base-xnli-multilingual-nli-2mil7  0.488889    0.833333    0.688373
6                          cross-encoder-nli-deberta-v3-large  0.466667    0.566667    0.541667
7                                    facebook-bart-large-mnli  0.466667    0.708333    0.400000
8                     MoritzLaurer-mDeBERTa-v3-base-mnli-xnli  0.455556    0.408333    0.481250
9                           cross-encode

In [25]:
# return packing list additionally to classes
# Load packing item data
with open("packing_templates_self_supported_offgrid_expanded.json", "r") as file:
    packing_items = json.load(file)

# function and gradio app
def classify(model_name, trip_descr, cut_off = 0.5):
    classifier = pipeline("zero-shot-classification", model=model_name)
    ## Create and fill dataframe with class predictions
    df = pd.DataFrame(columns=['superclass', 'pred_class'])
    for i, key in enumerate(keys_list):
        if key == 'activities':
            result = classifier(trip_descr, candidate_labels[key], multi_label=True)
            indices = [i for i, score in enumerate(result['scores']) if score > cut_off]
            classes = [result['labels'][i] for i in indices]
        else:
            result = classifier(trip_descr, candidate_labels[key])
            classes = result["labels"][0]
        df.loc[i] = [key, classes]

    ## Look up and return list of items to pack based on class predictions
    # make list from dataframe column
    all_classes = [elem for x in df["pred_class"] for elem in (x if isinstance(x, list) else [x])]
    # look up packing items for each class/key
    list_of_list_of_items = [packing_items.get(k, []) for k in all_classes]
    # combine lists and remove doubble entries
    flat_unique = []
    for sublist in list_of_list_of_items:
        for item in sublist:
            if item not in flat_unique:
                flat_unique.append(item)
    # sort alphabetically to notice duplicates
    sorted_list = sorted(flat_unique)  
    return df, sorted_list

In [26]:
# Access the first trip description
first_trip = trip_descriptions[0]
tmp = classify("facebook/bart-large-mnli", first_trip )
print(tmp)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


(           superclass                              pred_class
0       activity_type                          beach vacation
1          activities  [going to the beach, relaxing, hiking]
2   climate_or_season               warm destination / summer
3    style_or_comfort                              minimalist
4          dress_code                                  casual
5       accommodation                    huts with half board
6      transportation                          no own vehicle
7  special_conditions               off-grid / no electricity
8    trip_length_days                                 7+ days, ['1 set kleding voor elke situatie', 'EHBO-set', 'USB-hub (voor meerdere devices)', 'aantal maaltijden/snacks afgestemd op duur', 'alles-in-één zeep', 'back-up verlichting (bijv. kleine zaklamp)', 'blarenpleisters of tape', 'boek of e-reader', 'comfortabele kleding', 'compacte tandenborstel', 'contant geld voor betalingen', 'dagrugzak', 'extra kledinglaag', 'extra opladerkabe

# Use gradio for user input

In [2]:
# Prerequisites
from transformers import pipeline
import json
import pandas as pd
import gradio as gr

# get candidate labels
with open("packing_label_structure.json", "r") as file:
    candidate_labels = json.load(file)
keys_list = list(candidate_labels.keys())

# Load test data (in list of dictionaries)
with open("test_data.json", "r") as file:
    packing_data = json.load(file)

# Load packing item data
with open("packing_templates_self_supported_offgrid_expanded.json", "r") as file:
    packing_items = json.load(file)

In [3]:
# function and gradio app
def classify(model_name, trip_descr, cut_off = 0.5):
    classifier = pipeline("zero-shot-classification", model=model_name)
    ## Create and fill dataframe with class predictions
    df = pd.DataFrame(columns=['superclass', 'pred_class'])
    for i, key in enumerate(keys_list):
        if key == 'activities':
            result = classifier(trip_descr, candidate_labels[key], multi_label=True)
            indices = [i for i, score in enumerate(result['scores']) if score > cut_off]
            classes = [result['labels'][i] for i in indices]
        else:
            result = classifier(trip_descr, candidate_labels[key])
            classes = result["labels"][0]
        df.loc[i] = [key, classes]

    ## Look up and return list of items to pack based on class predictions
    # make list from dataframe column
    all_classes = [elem for x in df["pred_class"] for elem in (x if isinstance(x, list) else [x])]
    # look up packing items for each class/key
    list_of_list_of_items = [packing_items.get(k, []) for k in all_classes]
    # combine lists and remove doubble entries
    flat_unique = []
    for sublist in list_of_list_of_items:
        for item in sublist:
            if item not in flat_unique:
                flat_unique.append(item)
    # sort alphabetically to notice duplicates
    sorted_list = sorted(flat_unique)  
    return df, "\n".join(sorted_list)

demo = gr.Interface(
    fn=classify,
    inputs=[
        gr.Textbox(label="Model name", value = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"),
        gr.Textbox(label="Trip description"),
        gr.Number(label="Activity cut-off", value = 0.5),
    ],
    # outputs="dataframe",
    outputs=[gr.Dataframe(label="DataFrame"), gr.Textbox(label="List of words")],
    title="Trip classification",
    description="Enter a text describing your trip",
)

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()


Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [32]:
print(first_trip)

I am planning a trip to Greece with my boyfriend, where we will visit two islands. We have booked an apartment on each island for a few days and plan to spend most of our time relaxing. Our main goals are to enjoy the beach, try delicious local food, and possibly go on a hike—if it’s not too hot. We will be relying solely on public transport. We’re in our late 20s and traveling from the Netherlands.


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


**Check for duplicate entries, which to combine?**

In [3]:
print(type(packing_items))

<class 'dict'>


In [9]:
# Load packing item data
with open("packing_templates_self_supported_offgrid_expanded.json", "r") as file:
    packing_items = json.load(file)

unique_sorted = sorted({item for values in packing_items.values() for item in values})

for item in unique_sorted:
    print(item)


1 set of clothing for every situation
GPS or offline maps
Gore‑Tex clothing
Gore‑Tex jacket and pants
MiFi router or portable WiFi hotspot
SUP board and paddle
USB hub (for multiple devices)
WiFi hotspot or local SIM card
accessories
activity book or tablet with films
airbag backpack (if available)
all‑in‑one soap
at least 2 liters of water storage per person
avalanche beacon (transceiver)
baby monitor (for staying at location)
backpack
backup lighting (e.g. small flashlight)
bags for waste
bait / lures
bank card / cash
beach bag
beach chair
beach towel
belay device
bike light and lock
bike or rental bike
biodegradable soap + sponge
bivvy bag or tarp
blister plasters or tape
board leash
book / meditation material
book or e‑reader
boots or waders
bottles and food (if applicable)
breathable thermal clothing
buff or neck warmer
business cards / documents
camera + lenses
camera or smartphone
camping gear (if staying overnight)
camping table (optional)
cap or hat
car documents
cash / card
c