Build an LLM from scratch

This revision is from 2024/07/01 21:21. You can Restore it.

Step 1: Plan and design the LLM

  1. Standard model design
  2. LLM has the ability to re-train itself, to hit the re-train button. (no human required)
  3. LLM is constantly being re-fed its training data and told to re-work and improve it, with prompt engineering to choose facts and statistics (no human required). New data is also added to a separate directory.
  4. The trainer is in the model. LLM re-works its training code as well to produce a better model. Developing the trainer means the LLM's ability to distinguish differences correctly, better from worse, yes from no and so on, successful compile vs errors, red from blue. Two copies and the LLM must choose which is better and update its training data.
  5. Demonstrator must be resource light enough for the LLM to perform these tasks.

Step 2: Eval Space

At its most basic this is human eval; more importantly, it is the set of tools that give the LLM the ability to test, prove and rework its training data. For instance, a code compiler or a training data compiler: the LLM can run its generated code, get evaluation feedback such as errors, and then rework it. Provide the model with more and more tools and means to better re-work its training data.

Synthesize new data. A simulation space which mimics real-world physics could be a universal space for performing evaluation. The simulation space, based on real-world physics and threads from the real world, is designed to ground the evaluation and even spur synthetic data creation. Throw them in an NPC space. For instance, we can put enough physics together to test wing designs. The LLM would evaluate a wing design, then determine a perhaps superior wing design, and then persist with that design at the mercy of the user, arguing its decision with facts and figures.

The focus shifts from the model to the trainer program.

Make the LLM

  1. Get the training datasets: Sources: Common Crawl, Wikipedia, books, articles, forums, public datasets (e.g., Project Gutenberg).
  2. Preprocess dataset: Data Preprocessing
    1. Tokenization: Split text into tokens (words, subwords, or characters).
    2. Normalization: Lowercase text, remove special characters, handle contractions, etc.
    3. Filtering: Remove non-text content, duplicates, and overly long or short texts.
    4. Encoding: Convert tokens to numerical representations using a tokenizer.
  3. Choose architecture for your LLM. Transformer-based models (e.g., GPT, BERT), Parameters: Define model size (number of layers, heads, hidden units).
  4. Training
  5. Evaluation

Use GPT2 tools:

from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Model hyperparameters for the from-scratch GPT-2 style network
config = GPT2Config(
    # Vocabulary size and context window
    vocab_size=50257,
    n_positions=1024,
    n_ctx=1024,
    # Network dimensions
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=3072,
    activation_function='gelu',
    # Dropout rates
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    # Numerics and weight initialisation
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
)

# Tokenizer: load the "gpt2" tokenizer files and register a padding token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Prepare dataset

def load_dataset(file_path, tokenizer, block_size=128):
    """Build a block-wise language-modeling dataset from a plain-text file.

    Args:
        file_path: Path to the text file to read.
        tokenizer: Tokenizer used to encode the file contents.
        block_size: Number of tokens per training example. Defaults to 128,
            the value that used to be hard-coded here.

    Returns:
        A ``TextDataset`` over the tokenized file.
    """
    # NOTE(review): recent transformers releases flag TextDataset as
    # deprecated in favour of the `datasets` library -- confirm against the
    # installed version before relying on it long term.
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

# Training and validation corpora, both encoded with the same tokenizer
train_dataset = load_dataset("path/to/train.txt", tokenizer)
val_dataset = load_dataset("path/to/val.txt", tokenizer)

# mlm=False: batches are prepared for causal (next-token) language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize model from the config alone (no pretrained weights are loaded)
model = GPT2LMHeadModel(config)
# The tokenizer gained a '[PAD]' token, so the embedding matrix must grow to match
model.resize_token_embeddings(len(tokenizer))

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
)

# Create trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

# No evaluation strategy is configured above, so the validation set would
# otherwise never be used. Run one explicit evaluation pass after training
# and report the resulting metrics.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the model and the tokenizer side by side so they can be reloaded together
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

Another, from scratch:

import torch

import torch.nn as nn

from transformers import PreTrainedModel, PretrainedConfig, Trainer, TrainingArguments

from datasets import load_dataset

from tokenizers import Tokenizer, models, pre_tokenizers, trainers

from transformers import PreTrainedTokenizerFast

# Define the model architecture

class NewLM(PreTrainedModel):
    """Small causal (left-to-right) Transformer language model.

    Token and learned position embeddings are summed, passed through a stack
    of ``nn.TransformerEncoderLayer`` blocks under a causal attention mask,
    and projected back to vocabulary logits.

    The config must provide ``vocab_size``, ``hidden_size``, ``num_heads``,
    ``intermediate_size``, ``hidden_dropout_prob``, ``num_hidden_layers`` and
    ``max_position_embeddings``.
    """

    # NOTE(review): `config_class` is not set here because the config class is
    # defined further down in this file; `from_pretrained` will likely need it
    # assigned once both classes exist -- confirm.

    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        # Self-attention is order-agnostic, so positions are embedded explicitly.
        self.position_embedding = nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.hidden_size,
            nhead=config.num_heads,
            dim_feedforward=config.intermediate_size,
            dropout=config.hidden_dropout_prob,
            batch_first=True,  # keep (batch, seq, hidden) throughout; no permutes needed
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=config.num_hidden_layers,
        )
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Compute next-token logits and, when labels are given, the LM loss.

        Args:
            input_ids: Integer tensor of token ids, shape (batch, seq_len).
                seq_len must not exceed ``config.max_position_embeddings``.
            attention_mask: Optional tensor of the same shape using the
                Hugging Face convention (non-zero = real token, 0 = padding).
            labels: Optional tensor of target ids, same shape as
                ``input_ids``. Positions equal to -100 are ignored.

        Returns:
            The logits tensor of shape (batch, seq_len, vocab_size) when
            ``labels`` is None; otherwise a dict with keys ``"loss"`` and
            ``"logits"`` (the form ``Trainer`` reads the loss from).
        """
        seq_len = input_ids.size(1)
        device = input_ids.device

        positions = torch.arange(seq_len, device=device)
        hidden = self.embedding(input_ids) + self.position_embedding(positions)

        # True above the diagonal = "may not attend": position i only sees <= i.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=device),
            diagonal=1,
        )

        # PyTorch wants True at positions to IGNORE, the opposite of the
        # Hugging Face attention_mask, so the mask has to be inverted.
        # NOTE: assumes right-padding; with left-padding a query row could be
        # fully masked.
        padding_mask = None
        if attention_mask is not None:
            padding_mask = attention_mask == 0

        hidden = self.transformer(
            hidden,
            mask=causal_mask,
            src_key_padding_mask=padding_mask,
        )
        logits = self.lm_head(hidden)

        if labels is None:
            return logits

        # Standard causal-LM shift: the logits at position t predict token t+1.
        loss = nn.functional.cross_entropy(
            logits[:, :-1, :].reshape(-1, logits.size(-1)),
            labels[:, 1:].reshape(-1),
            ignore_index=-100,
        )
        return {"loss": loss, "logits": logits}

# Create a custom configuration

class NewLMConfig(PretrainedConfig):
    """Hyperparameter container for the ``NewLM`` model.

    Args:
        vocab_size: Number of entries in the token embedding table.
        hidden_size: Width of the embeddings and Transformer layers.
        num_hidden_layers: Number of stacked Transformer layers.
        num_heads: Attention heads per layer; must divide ``hidden_size``.
        intermediate_size: Width of each layer's feed-forward sub-network.
        hidden_dropout_prob: Dropout probability inside the Transformer layers.
        max_position_embeddings: Maximum sequence length the model is sized for.
        **kwargs: Forwarded unchanged to ``PretrainedConfig``.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``num_heads``.
    """

    model_type = "new_lm"

    def __init__(
        self,
        vocab_size=30000,
        hidden_size=256,
        num_hidden_layers=6,
        num_heads=8,
        intermediate_size=1024,
        hidden_dropout_prob=0.1,
        max_position_embeddings=512,
        **kwargs
    ):
        # Multi-head attention splits hidden_size evenly across heads; fail
        # here with a clear message instead of deep inside model construction.
        if hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by "
                f"num_heads ({num_heads})"
            )

        super().__init__(**kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.max_position_embeddings = max_position_embeddings

# Train tokenizer

def train_tokenizer(texts):
    """Train a BPE tokenizer on ``texts`` and wrap it for use with transformers.

    Args:
        texts: Iterable of strings to learn the vocabulary from.

    Returns:
        A ``PreTrainedTokenizerFast`` whose pad/unk/cls/sep/mask tokens are
        registered, so padding (e.g. ``padding="max_length"``) works.
    """
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

    # Give the BPE model an explicit unknown token for symbols it cannot encode.
    bpe_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    bpe_trainer = trainers.BpeTrainer(special_tokens=special_tokens)
    bpe_tokenizer.train_from_iterator(texts, bpe_trainer)

    # Adding the special tokens to the vocabulary is not enough: the wrapper
    # must also be told which role each one plays, otherwise it has no
    # pad_token and any padding request raises an error.
    return PreTrainedTokenizerFast(
        tokenizer_object=bpe_tokenizer,
        pad_token="[PAD]",
        unk_token="[UNK]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

# Load and preprocess data

# Raw training split of the "wikitext-2-raw-v1" configuration of "wikitext"
dataset = load_dataset(
    "wikitext",
    "wikitext-2-raw-v1",
    split="train",
)
texts = dataset["text"]

# Learn the tokenizer vocabulary from the raw text column
tokenizer = train_tokenizer(texts)

# Tokenize dataset

def tokenize_function(examples):
    """Tokenize a batch of dataset rows to fixed-length sequences.

    Args:
        examples: Batch mapping with a ``"text"`` entry holding the strings.

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to exactly 512 tokens.
    """
    # Relies on the module-level `tokenizer`.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )

# Tokenize in batches and drop the original columns so only tokenizer output remains
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# Initialize model; the embedding table is sized to the trained tokenizer
config = NewLMConfig(vocab_size=len(tokenizer))
model = NewLM(config)

# Set training arguments
training_args = TrainingArguments(
    # Output location and checkpoint retention
    output_dir="./results",
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
    # Schedule and batch size
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # Evaluation output and logging
    prediction_loss_only=True,
    logging_dir='./logs',
)

# Define data collator

def data_collator(features):
    """Stack per-example features into one batch for causal-LM training.

    Args:
        features: Sequence of mappings, each with equally long ``"input_ids"``
            and ``"attention_mask"`` entries.

    Returns:
        Dict of tensors with keys ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"``. Labels are a copy of the input ids in which every
        position whose attention mask is 0 is replaced by -100.
    """
    input_ids = torch.stack(
        [torch.as_tensor(f["input_ids"]) for f in features]
    )
    attention_mask = torch.stack(
        [torch.as_tensor(f["attention_mask"]) for f in features]
    )

    # -100 is the default ignore_index of PyTorch's cross-entropy loss, so
    # padded positions do not contribute to the training loss. The clone keeps
    # input_ids itself untouched.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Create trainer and train

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)
trainer.train()

# Write the model and the tokenizer into the same directory
model.save_pretrained("./new_lm")
tokenizer.save_pretrained("./new_lm")

Rework Training Data

  1. Load the data.
  2. Initialize the LLM.
  3. Create a loop to process the data.
  4. In each iteration, select a random piece of data.
  5. Use the LLM to generate a new version of the data.
  6. Replace the original data with the generated data.
  7. Repeat until all data has been processed.

import random

import torch

from transformers import BertTokenizer, BertForMaskedLM

# Load your data. These two strings are placeholders: replace them with the
# real corpus. Every entry must be a string; a literal `...` inside the list
# would put an Ellipsis object into the data and break tokenization.
data = ["example sentence 1", "example sentence 2"]

# Initialize the LLM (BERT), on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased").to(device)

# Loop through the data

# Visit every entry exactly once, in random order. Drawing a fresh random
# index on each pass would rework some entries several times and skip others.
indices = list(range(len(data)))
random.shuffle(indices)

# Ids such as [CLS], [SEP] and [PAD] must never be chosen for masking.
special_ids = set(tokenizer.all_special_ids)

for idx in indices:
    input_text = data[idx]

    # Tokenize the input text (truncated to the model's maximum length)
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(device)
    input_ids = inputs["input_ids"]

    # Pick a random maskable token position
    candidates = [
        pos
        for pos, token_id in enumerate(input_ids[0].tolist())
        if token_id not in special_ids
    ]
    if not candidates:
        # Empty input or only special tokens: nothing to rework.
        continue
    masked_index = random.choice(candidates)
    input_ids[0, masked_index] = tokenizer.mask_token_id

    # Generate a new version of the data (inference only, so no gradients)
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_index = torch.argmax(outputs.logits[0, masked_index]).item()

    # masked_index is a TOKEN position, not a character offset, so the
    # prediction is written back into the token sequence and the sequence is
    # decoded, rather than splicing characters in the original string.
    # The decoded text follows the tokenizer's normalization.
    input_ids[0, masked_index] = predicted_index
    new_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)

    # Replace the original data with the generated data
    data[idx] = new_text

# Print the modified data
print(data)

When complete, retrain the model and repeat: after the loop exits, run the model training script. Turn the model loading into a function, reload the new model, and repeat endlessly. Put new training data in the directory the script uses, or keep two directories: an orig directory and a rework directory.

  

📝 📜 ⏱️ ⬆️