nicholasKluge committed on
Commit 8f0da75 · 1 Parent(s): 4f71d07

Update README.md

Files changed (1)
  1. README.md +16 -40
README.md CHANGED
@@ -15,7 +15,7 @@ tags:
 
 TeenyTinyLlama is a series of small foundational models trained on Portuguese.
 
-This repository contains a version of [TeenyTinyLlama-162m](https://huggingface.co/nicholasKluge/TeenyTinyLlama-162m) fine-tuned on the [FAQUAD dataset](https://huggingface.co/datasets/ruanchaves/faquad-nli).
+This repository contains a version of [TeenyTinyLlama-162m](https://huggingface.co/nicholasKluge/TeenyTinyLlama-162m) fine-tuned on the [FaQuAD-NLI dataset](https://huggingface.co/datasets/ruanchaves/faquad-nli).
 
 ## Reproducing
 
@@ -25,46 +25,24 @@ This repository contains a version of [TeenyTinyLlama-162m](https://huggingface.
 
 import evaluate
 import numpy as np
-from huggingface_hub import login
 from datasets import load_dataset, Dataset, DatasetDict
 from transformers import AutoTokenizer, DataCollatorWithPadding
 from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
 
-# Basic fine-tuning arguments
-token="your_token"
-task="ruanchaves/faquad-nli"
-model_name="nicholasKluge/Teeny-tiny-llama-162m"
-output_dir="checkpoint"
-learning_rate=4e-5
-per_device_train_batch_size=16
-per_device_eval_batch_size=16
-num_train_epochs=3
-weight_decay=0.01
-evaluation_strategy="epoch"
-save_strategy="epoch"
-hub_model_id="nicholasKluge/Teeny-tiny-llama-162m-faquad"
-
-# Login on the hub to load and push
-login(token=token)
-
 # Load the task
-dataset = load_dataset(task)
+dataset = load_dataset("ruanchaves/faquad-nli")
 
 # Create a `ModelForSequenceClassification`
 model = AutoModelForSequenceClassification.from_pretrained(
-    model_name,
+    "nicholasKluge/TeenyTinyLlama-162m",
     num_labels=2,
     id2label={0: "UNSUITABLE", 1: "SUITABLE"},
     label2id={"UNSUITABLE": 0, "SUITABLE": 1}
 )
 
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-# If model does not have a pad_token, we need to add it
-#tokenizer.pad_token = tokenizer._eos_token
-#model.config.pad_token_id = model.config.eos_token_id
+tokenizer = AutoTokenizer.from_pretrained("nicholasKluge/TeenyTinyLlama-162m")
 
-# Preprocess if needed
+# Format the dataset
 train = dataset['train'].to_pandas()
 train['text'] = train['question'] + tokenizer.bos_token + train['answer'] + tokenizer.eos_token
 train = train[['text', 'label']]
@@ -82,7 +60,7 @@ dataset = DatasetDict({
     "test": test
 })
 
-# Pre process the dataset
+# Preprocess the dataset
 def preprocess_function(examples):
     return tokenizer(examples["text"], truncation=True)
 
@@ -102,20 +80,18 @@ def compute_metrics(eval_pred):
 
 # Define training arguments
 training_args = TrainingArguments(
-    output_dir=output_dir,
-    learning_rate=learning_rate,
-    per_device_train_batch_size=per_device_train_batch_size,
-    per_device_eval_batch_size=per_device_eval_batch_size,
-    num_train_epochs=num_train_epochs,
-    weight_decay=weight_decay,
-    evaluation_strategy=evaluation_strategy,
-    save_strategy=save_strategy,
+    output_dir="checkpoints",
+    learning_rate=4e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
     load_best_model_at_end=True,
     push_to_hub=True,
-    hub_token=token,
-    hub_private_repo=True,
-    hub_model_id=hub_model_id,
-    tf32=True,
+    hub_token="your_token_here",
+    hub_model_id="username/model-ID"
 )
 
 # Define the Trainer
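
For context, a minimal sketch (not part of the diff above) of how the resulting classifier could be queried once the fine-tuned weights are on the Hub. The repository id below is a placeholder matching the `hub_model_id` argument in the script; substitute the actual repository name.

```python
# Minimal inference sketch. Assumption: the fine-tuned checkpoint was pushed to
# the repository set in `hub_model_id` above ("username/model-ID" is a placeholder).
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

model_id = "username/model-ID"  # replace with the actual Hub repository id

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Training concatenated question + bos_token + answer + eos_token,
# so inference inputs should follow the same format.
question = "Quando foi fundada a universidade?"  # illustrative example only
answer = "A universidade foi fundada em 1934."   # illustrative example only
print(classifier(question + tokenizer.bos_token + answer + tokenizer.eos_token))
# Output is a list of dicts, e.g. [{'label': 'SUITABLE', 'score': ...}]
```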