Spike GPT- Immortality Knowledge Base

Spike GPT

This revision is from 2024/10/20 20:50. You can Restore it.

git clone https://github.com/ridgerchu/SpikeGPT.git

git clone https://huggingface.co/ridger/SpikeGPT-OpenWebText-216M

python3 -m venv spike_env

source ./spike_env/bin/activate

pip install -r requirements.txt

Open run.py and set args.RUN_DEVICE to "cpu" instead of "cuda" if you do not have CUDA.

Open run.py and replace the model name with MODEL_NAME = 'SpikeGPT-OpenWebText-216M/SpikeGPT-216M'


nano /home/x/SpikeGPT/src/utils.py
# Replace the line `probs = F.softmax(torch.tensor(out), dim=-1)` with the following line:
probs = F.softmax(out.clone().detach(), dim=-1)

Warning: SpikeGPT/src/model_run.py:42: FutureWarning: You are using `torch.load` with `weights_only=False`. To silence it, pass weights_only=True to the torch.load call on that line:


 w = torch.load(args.MODEL_NAME + '.pth', map_location='cpu', weights_only=True)

python3 run.py


########################################################################################################
# Run with python3 run.py
# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM
########################################################################################################

import numpy as np
import math, os, sys, types, time, gc
import torch
from src.utils import TOKENIZER
import matplotlib.ticker as ticker
# An optional first command-line argument selects the visible GPU(s),
# e.g. `python3 run.py 0`.
try:
    os.environ["CUDA_VISIBLE_DEVICES"] = sys.argv[1]
except IndexError:
    # No argument was given: leave CUDA_VISIBLE_DEVICES as it is.
    # Only the "missing argument" case is ignored, so unrelated failures
    # (and Ctrl-C) are no longer silently swallowed.
    pass
# Enable cuDNN benchmark mode and allow TF32 in cuDNN and CUDA matmul.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
# Compact NumPy printing: 4 decimals, no scientific notation, wide lines.
np.set_printoptions(precision=4, suppress=True, linewidth=200)
# Run configuration; filled in below and handed to RWKV_RNN.
args = types.SimpleNamespace()

########################################################################################################
# Step 1: set model & config (use v4 to run your trained-from-scratch models. v4 and v4neo are compatible)
########################################################################################################

# Inference device and numeric precision.
#   RUN_DEVICE: 'cuda' or 'cpu' (cpu is already fast)
#   FLOAT_MODE: 'fp16' (good for GPU, does not work for CPU),
#               'fp32' (good for CPU),
#               'bf16' (less accurate, but works for CPU)
args.RUN_DEVICE = "cpu"
args.FLOAT_MODE = "fp32"

# JIT switch, '1' or '0'. Very useful for GPU/CPU fp32, but might be harmful
# for GPU fp16 -- please benchmark.
# The 'nvfuser' backend (RWKV_RUN_BACKEND) stays disabled: it was marked as
# buggy (wrong output) for CUDA runs.
os.environ["RWKV_JIT_ON"] = "1"

# Tokenizer settings for the 216M OpenWebText pre-trained model.
TOKEN_MODE = "pile"
WORD_NAME = ["20B_tokenizer.json"] * 2  # [vocab, vocab] for Pile model
UNKNOWN_CHAR = None
vocab_size = 50277
# For the BookCorpus pre-trained model use these values instead:
#   TOKEN_MODE = "char"
#   WORD_NAME = "vocab_book"
#   UNKNOWN_CHAR = ' '
#   vocab_size = 77

# Checkpoint path (without the '.pth' suffix) and model dimensions.
MODEL_NAME = 'SpikeGPT-OpenWebText-216M/SpikeGPT-216M'
n_layer, n_embd, ctx_len = 18, 768, 1024

# Copy the settings onto the args namespace that is handed to the model.
vars(args).update(
    MODEL_NAME=MODEL_NAME,
    n_layer=n_layer,
    n_embd=n_embd,
    ctx_len=ctx_len,
    vocab_size=vocab_size,
    head_qk=0,
    pre_ffn=0,
    grad_cp=0,
    my_pos_emb=0,
)
os.environ["RWKV_RUN_DEVICE"] = args.RUN_DEVICE

########################################################################################################
# Step 2: set prompt & sampling stuffs
########################################################################################################

# Conversation text fed to the model; the interactive loop at the end of the
# script appends each user turn to it.
context = ''

# Number of generations per prompt, and tokens generated per generation.
NUM_TRIALS = 1
LENGTH_PER_TRIAL = 333

# Sampling parameters forwarded to tokenizer.sample_logits.
TEMPERATURE = 1.5
top_p = 0.7
top_p_newline = 0.9  # only used in TOKEN_MODE = char

# When True, generation runs a single trial of a single token and prints the
# raw model output for that step.
DEBUG_DEBUG = False  # True / False

########################################################################################################

print(f'\nUsing {args.RUN_DEVICE.upper()}. Loading {MODEL_NAME}...')
# NOTE(review): this import sits here, after the RWKV_* environment variables
# have been set, rather than at the top of the file -- presumably
# src.model_run reads them at import time; confirm before moving it.
from src.model_run import RWKV_RNN

# Build the model from the configuration collected in `args`.
model = RWKV_RNN(args)

print(f'\nOptimizing speed...')
# Warm-up forward pass, currently disabled:
#out, _ = model.forward([187], None, None, None)
# print(out)
# Release unreferenced Python objects and cached CUDA memory after loading.
gc.collect()
torch.cuda.empty_cache()

print(f'\nLoading tokenizer {WORD_NAME}...')
tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)
if TOKEN_MODE == "pile":
    # Sanity check: token id 187 must decode to a newline in this vocabulary.
    assert tokenizer.tokenizer.decode([187]) == '\n'

########################################################################################################

def generate_response(context, model, tokenizer, ctx_len, temperature, top_p, top_p_newline, debug_debug):
    """Generate a continuation of ``context``, stream it to stdout and return it.

    The prompt is first fed to the model one token at a time to build the
    recurrent state, then tokens are sampled one by one and printed as soon
    as they decode to valid text.

    Reads the module-level settings NUM_TRIALS, LENGTH_PER_TRIAL and
    TOKEN_MODE.

    Args:
        context: Prompt text to continue.
        model: Object exposing ``forward(tokens, state, mem1, mem2,
            preprocess_only=False)``.
        tokenizer: Tokenizer wrapper providing ``charMode``, ``sample_logits``
            and either ``stoi``/``itos``/``refine_context`` (char mode) or an
            inner ``tokenizer`` with ``encode``/``decode``.
        ctx_len: Maximum number of trailing tokens passed to the model and to
            the sampler.
        temperature: Sampling temperature passed to ``sample_logits``.
        top_p: Passed to ``sample_logits`` as ``top_p_usual``.
        top_p_newline: Passed to ``sample_logits`` as ``top_p_newline``.
        debug_debug: When true, run one trial of one token and print the raw
            model output for that step.

    Returns:
        The text that was printed for the last trial (an empty string if no
        trial ran). Previously the function returned ``None``.

    Raises:
        ValueError: If ``context`` encodes to zero tokens, since there is
            then no model output to sample the first token from.
    """
    if tokenizer.charMode:
        context = tokenizer.refine_context(context)
        ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
    else:
        ctx = tokenizer.tokenizer.encode(context)
    src_len = len(ctx)
    if src_len == 0:
        # Without this guard the code below fails later with an opaque
        # AttributeError on `None.clone()`.
        raise ValueError("context must encode to at least one token")
    src_ctx = ctx.copy()

    init_state = None
    init_out = None
    state = None
    mem1 = None
    mem2 = None
    out = None
    response = ""

    for TRIAL in range(1 if debug_debug else NUM_TRIALS):
        ctx = src_ctx.copy()
        pieces = []  # text chunks printed during this trial

        if TRIAL == 0:
            # Prefill: run the prompt through the model once; only the last
            # step needs the output logits.
            for i in range(src_len):
                # Same ctx_len truncation as in the generation loop below.
                x = ctx[: i + 1][-ctx_len:]
                if i == src_len - 1:
                    init_out, init_state, mem1, mem2 = model.forward(x, init_state, mem1, mem2)
                else:
                    init_state, mem1, mem2 = model.forward(x, init_state, mem1, mem2, preprocess_only=True)
            gc.collect()
            torch.cuda.empty_cache()

        # NOTE(review): `state` is restored from `init_state` for every trial
        # but `mem1`/`mem2` carry over from the previous trial; harmless with
        # NUM_TRIALS == 1, confirm the intent before raising it.
        out_last = src_len
        for i in range(src_len, src_len + (1 if debug_debug else LENGTH_PER_TRIAL)):
            x = ctx[: i + 1]
            x = x[-ctx_len:]

            if i == src_len:
                # First generated token: start from the prefill result.
                out = init_out.clone()
                state = init_state.clone()
            else:
                out, state, mem1, mem2 = model.forward(x, state, mem1, mem2)
            if debug_debug:
                print("model", np.array(x), "==>", np.array(out), np.max(out.cpu().numpy()), np.min(out.cpu().numpy()))
            if TOKEN_MODE == "pile":
                out[0] = -999999999  # disable <|endoftext|>

            ttt = tokenizer.sample_logits(
                out,
                x,
                ctx_len,
                temperature=temperature,
                top_p_usual=top_p,
                top_p_newline=top_p_newline,
            )
            ttt = int(ttt)
            ctx += [ttt]

            if tokenizer.charMode:
                char = tokenizer.itos[ttt]
                print(char, end="", flush=True)
                pieces.append(char)
            else:
                # Decode everything not yet printed; hold it back while it
                # still contains a replacement character (incomplete UTF-8).
                char = tokenizer.tokenizer.decode(ctx[out_last:])
                if '\ufffd' not in char: # is valid utf8 string?
                    print(char, end="", flush=True)
                    pieces.append(char)
                    out_last = i+1

        print("\n")
        response = "".join(pieces)

    return response

print("\nInteractive inference mode. Type 'exit' to quit.\n")

while True:
    try:
        user_input = input("You: ")
    except EOFError:
        # stdin was closed (Ctrl-D or exhausted pipe): end the session the
        # same way as typing 'exit' instead of crashing with a traceback.
        print()
        break
    if user_input.lower() == 'exit':
        break
    context += f"You: {user_input}\nBot: "
    reply = generate_response(context, model, tokenizer, ctx_len, TEMPERATURE, top_p, top_p_newline, DEBUG_DEBUG)
    # Keep the model's answer in the running conversation; previously only a
    # newline was appended, so every "Bot: " turn in the context was empty.
    context += reply + "\n"

print(("-" * 50) + '\n')

📝 📜 ⏱️ ⬆️