Build an LLM from scratch


Step 1: Plan and design the LLM

  1. Standard model design.
  2. The LLM has the ability to re-train itself, i.e., to hit the re-train button on its own (no human required).
  3. The LLM constantly re-works its training data to improve it (no human required). See the loop sketch after this list.
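Items 2 and 3 amount to a closed loop: evaluate, rework the data, re-train, repeat. Below is a minimal sketch of that loop; the evaluate, rework, and retrain callables are hypothetical placeholders for whatever the demonstrator actually provides (an eval harness, a data rewriter, a training job), not part of any library.

# Sketch of the self-improvement loop from items 2 and 3. The evaluate, rework,
# and retrain callables are hypothetical placeholders supplied by the demonstrator.
def self_retrain_loop(model, data, evaluate, rework, retrain,
                      max_rounds=5, min_gain=0.01):
    score = evaluate(model, data)
    for _ in range(max_rounds):
        data = rework(model, data)        # the LLM re-works its own training data
        candidate = retrain(model, data)  # the LLM hits the re-train button itself
        new_score = evaluate(candidate, data)
        if new_score - score < min_gain:  # stop once improvement stalls
            break
        model, score = candidate, new_score
    return model, data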

Note: focus on the LLM's ability to make correct distinctions: better from worse, yes from no, a successful compile from errors, red from blue, and so on.

The demonstrator must be light enough on resources for the LLM to perform these tasks.

Step 2: Eval Space

The tools that give the LLM the ability to test, verify, and rework its training data, for instance a code compiler or an analogous checker for training data. The core of it is A-versus-B thinking.
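A concrete example of A-versus-B thinking with a compiler as the judge, using only Python's built-in compile(); the two candidate snippets are made up for illustration:

# Pick the candidate that compiles; the compiler is a ground-truth signal the
# LLM can check itself against ("successful compile vs errors").
def compiles(source: str) -> bool:
    try:
        compile(source, "<candidate>", "exec")
        return True
    except SyntaxError:
        return False

candidate_a = "def add(a, b): return a + b"
candidate_b = "def add(a, b) return a + b"   # missing colon

better = candidate_a if compiles(candidate_a) else candidate_b
print(compiles(candidate_a), compiles(candidate_b))  # True False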

Step 3: Make the LLM

  1. Get the training datasets. Sources: Common Crawl, Wikipedia, books, articles, forums, public datasets (e.g., Project Gutenberg).
  2. Preprocess the dataset (see the preprocessing sketch after this list):
    1. Tokenization: Split text into tokens (words, subwords, or characters).
    2. Normalization: Lowercase text, remove special characters, handle contractions, etc.
    3. Filtering: Remove non-text content, duplicates, and overly long or short texts.
    4. Encoding: Convert tokens to numerical representations using a tokenizer.
  3. Choose an architecture for your LLM: transformer-based models (e.g., GPT, BERT). Define the model size (number of layers, heads, hidden units).
  4. Training.
  5. Evaluation.
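A minimal sketch of step 2, assuming the raw data is simply a Python list of strings and reusing the pretrained GPT-2 tokenizer from transformers for tokenization and encoding; thresholds and the regex are illustrative choices, not fixed requirements:

import re
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def preprocess(texts, min_chars=20, max_chars=5000):
    cleaned, seen = [], set()
    for t in texts:
        t = t.lower().strip()                       # normalization
        t = re.sub(r"[^\w\s.,;:!?'\"-]", " ", t)    # drop special characters
        if not (min_chars <= len(t) <= max_chars):  # filter overly short/long texts
            continue
        if t in seen:                               # filter duplicates
            continue
        seen.add(t)
        cleaned.append(t)
    # tokenization + encoding to numerical token ids
    return [tokenizer.encode(t) for t in cleaned]

encoded = preprocess(["Hello, World! This is a sample document for preprocessing."])

Steps 3 to 5 (architecture, training, evaluation) are then covered end to end by the following script, which configures and trains a GPT-2-sized model with the Hugging Face Trainer API: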

from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling,
)

# Define configuration (GPT-2 small sized)
config = GPT2Config(
    vocab_size=50257,
    n_positions=1024,
    n_ctx=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=3072,
    activation_function='gelu',
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
)

# Initialize tokenizer and add a padding token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Prepare dataset
def load_dataset(file_path, tokenizer):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=128,
    )

train_dataset = load_dataset("path/to/train.txt", tokenizer)
val_dataset = load_dataset("path/to/val.txt", tokenizer)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal language modeling, not masked LM
)

# Initialize model and account for the added [PAD] token
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(tokenizer))

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
)

# Create trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# Save the model and tokenizer
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")
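For step 5, the same trainer object from the script above can report the loss on the held-out validation set, from which perplexity follows directly; a short follow-on sketch:

import math

# Evaluate on val_dataset (passed as eval_dataset above) and derive perplexity
eval_metrics = trainer.evaluate()
perplexity = math.exp(eval_metrics["eval_loss"])
print(f"eval loss: {eval_metrics['eval_loss']:.3f}, perplexity: {perplexity:.1f}")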

Another, more minimal approach with a custom PyTorch model:

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import PreTrainedModel, PretrainedConfig
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# Define the model architecture
class SimpleLM(PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                config.hidden_size, config.num_heads, batch_first=True
            ),
            config.num_layers,
        )
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, input_ids):
        x = self.embedding(input_ids)
        # Causal mask so each position only attends to earlier positions
        mask = nn.Transformer.generate_square_subsequent_mask(
            input_ids.size(1)
        ).to(input_ids.device)
        x = self.transformer(x, mask=mask)
        return self.lm_head(x)

# Create a custom configuration
class SimpleLMConfig(PretrainedConfig):
    model_type = "simple_lm"

    def __init__(
        self,
        vocab_size=30000,
        hidden_size=256,
        num_layers=6,
        num_heads=8,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_heads = num_heads

# Train tokenizer
def train_tokenizer(texts):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    )
    tokenizer.train_from_iterator(texts, trainer)
    return tokenizer

# Load and preprocess data
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
texts = dataset["text"]

# Train tokenizer and configure fixed-length encoding
tokenizer = train_tokenizer(texts)
pad_id = tokenizer.token_to_id("[PAD]")
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding(pad_id=pad_id, pad_token="[PAD]", length=512)

# Tokenize dataset (a raw tokenizers.Tokenizer is not callable like a
# Hugging Face tokenizer, so use encode_batch)
def tokenize_function(examples):
    encodings = tokenizer.encode_batch(examples["text"])
    return {"input_ids": [e.ids for e in encodings]}

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Initialize model and optimizer; take the vocab size from the trained tokenizer
config = SimpleLMConfig(vocab_size=tokenizer.get_vocab_size())
model = SimpleLM(config)
optimizer = optim.Adam(model.parameters())

# Training loop (simplified)
for epoch in range(10):  # adjust the number of epochs as needed
    for batch in tokenized_dataset.iter(batch_size=32):
        optimizer.zero_grad()
        input_ids = torch.tensor(batch["input_ids"])
        logits = model(input_ids)
        # Next-token prediction: shift the targets by one and ignore padding
        loss = nn.functional.cross_entropy(
            logits[:, :-1].reshape(-1, config.vocab_size),
            input_ids[:, 1:].reshape(-1),
            ignore_index=pad_id,
        )
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} completed")

# Save model and tokenizer in Hugging Face format
model.save_pretrained("./simple_lm")
tokenizer.save("./simple_lm/tokenizer.json")
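A quick way to sanity-check the trained SimpleLM is a greedy decoding loop; this is a sketch (SimpleLM has no generate() method of its own), and the prompt is arbitrary:

# Greedy decoding with the trained SimpleLM and tokenizer
tokenizer.no_padding()      # padding/truncation were enabled for training above
tokenizer.no_truncation()
model.eval()
ids = tokenizer.encode("the history of").ids
with torch.no_grad():
    for _ in range(20):
        logits = model(torch.tensor([ids]))
        next_id = int(logits[0, -1].argmax())  # most likely next token
        ids.append(next_id)
print(tokenizer.decode(ids))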

  
