Build an LLM from scratch


Step 1: Plan and design the LLM

  1. Standard model design.
  2. The LLM has the ability to retrain itself, i.e., to hit the retrain button on its own (no human required).
  3. The LLM continuously reworks its training data to improve it (no human required).
  4. The LLM reworks its training code to produce a better model.

Note: focus on the LLM's ability to make distinctions correctly: better from worse, yes from no, a successful compile from errors, red from blue, and so on.

The demonstrator must be light enough on resources for the LLM to perform these tasks.
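
A minimal sketch of that self-administered loop (points 2 to 4 above); load_model, evaluate_model, rework_training_data, and train_model are hypothetical placeholders for the concrete scripts further down this page:

# Hypothetical loop: rework the data, retrain, keep whichever model is better
current_model = load_model("orig_model")                     # hypothetical helper
current_score = evaluate_model(current_model)                # hypothetical helper

while True:
    rework_training_data(current_model, "orig/", "rework/")  # hypothetical helper
    candidate = train_model("rework/")                        # hypothetical helper
    candidate_score = evaluate_model(candidate)
    if candidate_score > current_score:                       # better from worse, no human required
        current_model, current_score = candidate, candidate_score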

Step 2: Eval Space

These are the tools that give the LLM the ability to test, proof, and rework training data: for instance, a code compiler or a training-data compiler. A-versus-B thinking.
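
For example, a very small eval tool in the A-versus-B spirit (a sketch; the candidate snippets are made up) can use Python's built-in compile() to tell a successful compile from errors:

# Minimal A-versus-B eval: which candidate compiles?
def compiles(source: str) -> bool:
    try:
        compile(source, "<candidate>", "exec")
        return True
    except SyntaxError:
        return False

candidate_a = "def add(a, b):\n    return a + b\n"
candidate_b = "def add(a, b)\n    return a + b\n"   # missing colon

print("A compiles:", compiles(candidate_a))  # True
print("B compiles:", compiles(candidate_b))  # False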

Make the LLM

  1. Get the training datasets. Sources: Common Crawl, Wikipedia, books, articles, forums, public datasets (e.g., Project Gutenberg).
  2. Preprocess the dataset (see the short sketch after this list):
    1. Tokenization: split text into tokens (words, subwords, or characters).
    2. Normalization: lowercase text, remove special characters, handle contractions, etc.
    3. Filtering: remove non-text content, duplicates, and overly long or short texts.
    4. Encoding: convert tokens to numerical representations using a tokenizer.
  3. Choose an architecture for your LLM: transformer-based models (e.g., GPT, BERT). Parameters: define the model size (number of layers, heads, hidden units).
  4. Training.
  5. Evaluation.
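
A short sketch of preprocessing steps 2.1 to 2.4, assuming the Hugging Face GPT-2 tokenizer that the script below also uses (the sample texts are made up):

import re
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

raw_texts = ["An Example sentence!!", "An Example sentence!!", "x", "Another document."]

# Normalization: lowercase and strip special characters
normalized = [re.sub(r"[^a-z0-9\s.,']", "", t.lower()) for t in raw_texts]

# Filtering: drop duplicates and overly short texts
filtered = list(dict.fromkeys(t for t in normalized if len(t.split()) >= 2))

# Tokenization + encoding: split into subword tokens and convert to numerical IDs
encoded = [tokenizer.encode(t) for t in filtered]
print(filtered)
print(encoded)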

Use GPT2 tools:

from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Define configuration
config = GPT2Config(
    vocab_size=50257,
    n_positions=1024,
    n_ctx=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=3072,
    activation_function='gelu',
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
)

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Prepare dataset
def load_dataset(file_path, tokenizer):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=128,
    )

train_dataset = load_dataset("path/to/train.txt", tokenizer)
val_dataset = load_dataset("path/to/val.txt", tokenizer)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize model
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(tokenizer))

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
)

# Create trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# Save the model
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")
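
For step 5, evaluation, a common follow-up is to report perplexity from the held-out loss. A minimal sketch using the trainer defined above:

import math

# Evaluate on the validation set and report perplexity
eval_results = trainer.evaluate()
print(f"Validation loss: {eval_results['eval_loss']:.4f}")
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")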

Another version, built from scratch:

import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig, Trainer, TrainingArguments
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from transformers import PreTrainedTokenizerFast

# Define the model architecture
class NewLM(PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=config.hidden_size,
                nhead=config.num_heads,
                dim_feedforward=config.intermediate_size,
                dropout=config.hidden_dropout_prob
            ),
            num_layers=config.num_hidden_layers
        )
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, input_ids, attention_mask=None, labels=None):
        x = self.embedding(input_ids)
        x = x.permute(1, 0, 2)  # TransformerEncoder expects seq_len first
        if attention_mask is not None:
            # src_key_padding_mask expects True at the positions to ignore (padding)
            x = self.transformer(x, src_key_padding_mask=(attention_mask == 0))
        else:
            x = self.transformer(x)
        x = x.permute(1, 0, 2)  # change back to batch first
        logits = self.lm_head(x)
        if labels is not None:
            # Return a loss so Trainer can optimize the model
            loss = nn.CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1))
            return {"loss": loss, "logits": logits}
        return {"logits": logits}

# Create a custom configuration
class NewLMConfig(PretrainedConfig):
    model_type = "new_lm"

    def __init__(
        self,
        vocab_size=30000,
        hidden_size=256,
        num_hidden_layers=6,
        num_heads=8,
        intermediate_size=1024,
        hidden_dropout_prob=0.1,
        max_position_embeddings=512,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.max_position_embeddings = max_position_embeddings

# Train tokenizer
def train_tokenizer(texts):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
    tokenizer.train_from_iterator(texts, trainer)
    # Register the special tokens so padding works below
    return PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        pad_token="[PAD]",
        unk_token="[UNK]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

# Load and preprocess data (drop empty lines so no example is all padding)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
dataset = dataset.filter(lambda example: len(example["text"].strip()) > 0)
texts = dataset["text"]

# Train tokenizer
tokenizer = train_tokenizer(texts)

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512, padding="max_length")

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# Initialize model
config = NewLMConfig(vocab_size=len(tokenizer))
model = NewLM(config)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
)

# Define data collator
def data_collator(features):
    return {
        "input_ids": torch.stack([torch.tensor(f["input_ids"]) for f in features]),
        "attention_mask": torch.stack([torch.tensor(f["attention_mask"]) for f in features]),
        "labels": torch.stack([torch.tensor(f["input_ids"]) for f in features]),
    }

# Create trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

# Save the model and tokenizer
model.save_pretrained("./new_lm")
tokenizer.save_pretrained("./new_lm")
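
A quick way to sanity-check the freshly trained model is a hand-rolled greedy decoding loop (a sketch; NewLM does not implement generate(), and the prompt is made up):

# Greedy decoding with the NewLM and tokenizer defined above
model.eval()
input_ids = tokenizer("The history of", return_tensors="pt")["input_ids"].to(model.device)

with torch.no_grad():
    for _ in range(20):
        logits = model(input_ids)["logits"]              # [batch, seq_len, vocab]
        next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, next_id], dim=-1)

print(tokenizer.decode(input_ids[0].tolist()))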

Rework Training Data

  1. Load the data.
  2. Initialize the LLM.
  3. Create a loop to process the data.
  4. In each iteration, select a random piece of data.
  5. Use the LLM to generate a new version of the data.
  6. Replace the original data with the generated data.
  7. Repeat until all data has been processed.

import random
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Load your data
data = ["example sentence 1", "example sentence 2", ...]

# Initialize the LLM (BERT)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased").to(device)
model.eval()

# Loop through the data
for i in range(len(data)):
    # Select a random piece of data
    idx = random.randint(0, len(data) - 1)
    input_text = data[idx]

    # Tokenize the input text and mask a random (non-special) token
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    candidate_positions = [
        j for j, token in enumerate(inputs["input_ids"][0])
        if token.item() not in tokenizer.all_special_ids
    ]
    masked_index = random.choice(candidate_positions)
    inputs["input_ids"][0][masked_index] = tokenizer.mask_token_id

    # Generate a new version of the data
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = outputs.logits
    predicted_index = torch.argmax(predictions[0, masked_index]).item()

    # Replace the masked token with the prediction and decode back to text
    inputs["input_ids"][0][masked_index] = predicted_index
    new_text = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)

    # Replace the original data with the generated data
    data[idx] = new_text

# Print the modified data
print(data)

When a pass is complete, retrain the model and repeat. After the loop exits, run the model-training script. Turn the model loading into a function, reload the new model, and repeat endlessly. Drop new training data into the directory the training script uses, or keep two directories, an orig directory and a rework directory.
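
A sketch of that endless cycle; the rework_data() wrapper, the train_model.py script, its --data_dir flag, and the orig/ and rework/ directory names are all assumptions to be swapped for your own pieces:

import subprocess
import torch
from transformers import BertTokenizer, BertForMaskedLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_model(path="bert-base-uncased"):
    # Model loading as a function so the freshly trained model can be reloaded each cycle
    tokenizer = BertTokenizer.from_pretrained(path)
    model = BertForMaskedLM.from_pretrained(path).to(device)
    return tokenizer, model

tokenizer, model = load_model()

while True:
    # Hypothetical wrapper around the rework loop above: reads orig/, writes rework/
    rework_data(tokenizer, model, "orig/", "rework/")
    # Assumed training script and flag; substitute your own entry point
    subprocess.run(["python", "train_model.py", "--data_dir", "rework/"], check=True)
    # Reload the newly trained model and go again
    tokenizer, model = load_model("./trained_model")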

Provide the model with more tools and means to better rework its training data, or even to synthesize new data.
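
One possible synthesis tool (a sketch, not prescribed by this page) is a plain text-generation pipeline that expands seed sentences from the existing data into new candidate samples; candidates should still pass the eval space before they enter the training set:

from transformers import pipeline

# Synthesize new candidate training samples from existing seeds
generator = pipeline("text-generation", model="gpt2")

seeds = ["example sentence 1", "example sentence 2"]
synthetic = []
for seed in seeds:
    outputs = generator(seed, max_new_tokens=30, num_return_sequences=2, do_sample=True)
    synthetic.extend(o["generated_text"] for o in outputs)

print(synthetic)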

A simulation space which mimics real-world physics could be a universal space for performing evaluation. Since the direction of the models is self-administered by the LLM, they could be dropped into an NPC space such as the project at https://github.com/AkshitIreddy/Interactive-LLM-Powered-NPCs.

The simulation space, based on real-world physics and on threads from the real world, is designed to ground the evaluation and to spur synthetic data creation.

The focus shifts from the model to the trainer.

  
