Spike GPT

This revision is from 2024/10/20 20:20. You can Restore it.

python3 -m venv spike_env

source ./spike_env/bin/activate

pip install -r requirements.txt

Open run.py and, if you do not have a CUDA-capable GPU, set args.RUN_DEVICE to "cpu" and args.FLOAT_MODE to "fp32" (fp16 does not work on CPU).

nano /home/x/SpikeGPT/src/utils.py

# In src/utils.py, replace torch.tensor(out) with out.clone().detach() (the idiom PyTorch recommends for copying an existing tensor).

python3 run.py

########################################################################################################
# Run with: python3 run.py
# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM
########################################################################################################

import numpy as np

import math, os, sys, types, time, gc

import torch

from src.utils import TOKENIZER

import matplotlib.ticker as ticker

# Optional GPU selection: when a first command-line argument is present it is
# exported as CUDA_VISIBLE_DEVICES. Only the "no argument given" case is
# ignored; any other error is allowed to surface instead of being swallowed.
try:
    os.environ["CUDA_VISIBLE_DEVICES"] = sys.argv[1]
except IndexError:
    pass

# Turn on cuDNN benchmark mode and allow TF32 for cuDNN and CUDA matmul.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# Compact numpy output (used by the debug print of the logits).
np.set_printoptions(precision=4, suppress=True, linewidth=200)

# Settings container that is filled in below and handed to RWKV_RNN.
args = types.SimpleNamespace()

########################################################################################################
# Step 1: set model & config (use v4 to run your trained-from-scratch models. v4 and v4neo are compatible)
########################################################################################################

# Device and precision.
#   RUN_DEVICE: 'cuda' // 'cpu' (already fast)
#   FLOAT_MODE: fp16 (good for GPU, does not work for CPU) // fp32 (good for CPU)
#               // bf16 (less accurate, but works for CPU)
args.RUN_DEVICE = "cpu"
args.FLOAT_MODE = "fp32"

# if args.RUN_DEVICE == "cuda":
#     os.environ["RWKV_RUN_BACKEND"] = 'nvfuser' # !!!BUGGY!!! wrong output

# '1' or '0'. very useful for GPU/CPU fp32, but might be harmful for GPU fp16. please benchmark !!!
os.environ["RWKV_JIT_ON"] = "1"

# For BookCorpus Pre-trained model
# TOKEN_MODE = "char"
# WORD_NAME = "vocab_book"
# UNKNOWN_CHAR = ' '
# vocab_size = 77

# For 216M OpenWebText Pre-trained model
TOKEN_MODE = "pile"
WORD_NAME = ["20B_tokenizer.json"] * 2  # [vocab, vocab] for Pile model
UNKNOWN_CHAR = None
vocab_size = 50277

MODEL_NAME = "SpikeGPT-OpenWebText-216M/SpikeGPT-216M"
n_layer, n_embd, ctx_len = 18, 768, 1024

# Copy the module-level settings onto the args namespace, followed by the
# remaining switches, all of which are set to 0 for this run.
vars(args).update(
    MODEL_NAME=MODEL_NAME,
    n_layer=n_layer,
    n_embd=n_embd,
    ctx_len=ctx_len,
    vocab_size=vocab_size,
    head_qk=0,
    pre_ffn=0,
    grad_cp=0,
    my_pos_emb=0,
)

# Export the chosen device through the environment as well.
os.environ["RWKV_RUN_DEVICE"] = args.RUN_DEVICE

########################################################################################################
# Step 2: set prompt & sampling stuffs
########################################################################################################

# Conversation text; starts empty and is extended by the interactive loop.
context = ""

# Number of generations per prompt and number of tokens sampled in each.
NUM_TRIALS, LENGTH_PER_TRIAL = 1, 333

# Sampling parameters.
TEMPERATURE = 1.5
top_p, top_p_newline = 0.7, 0.9  # top_p_newline: only used in TOKEN_MODE = char

DEBUG_DEBUG = False  # True False → show softmax output

########################################################################################################
# Load the model first, then the tokenizer.

print(f'\nUsing {args.RUN_DEVICE.upper()}. Loading {MODEL_NAME}...')

# NOTE(review): this import deliberately stays below the RWKV_* environment
# variable assignments; presumably src.model_run reads them on import - confirm.
from src.model_run import RWKV_RNN

model = RWKV_RNN(args)

print('\nOptimizing speed...')
# out, _ = model.forward([187], None, None, None)
# print(out)

# Drop garbage left over from loading before the tokenizer is created.
gc.collect()
torch.cuda.empty_cache()

print(f'\nLoading tokenizer {WORD_NAME}...')
tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)

# Sanity check for the Pile vocabulary: token 187 has to decode to a newline.
assert TOKEN_MODE != "pile" or tokenizer.tokenizer.decode([187]) == '\n'

########################################################################################################

def generate_response(context, model, tokenizer, ctx_len, temperature, top_p, top_p_newline, debug_debug):
    """Generate a continuation of ``context``, stream it to stdout and return it.

    The prompt is encoded and fed to the model one growing prefix at a time to
    build the initial recurrent state; tokens are then sampled one by one and
    printed as soon as they decode to valid text.

    Besides its parameters the function reads the module-level settings
    ``NUM_TRIALS``, ``LENGTH_PER_TRIAL`` and ``TOKEN_MODE``.

    Args:
        context: Prompt text to continue.
        model: Object whose ``forward(tokens, state, mem1, mem2)`` returns
            ``(out, state, mem1, mem2)`` and, with ``preprocess_only=True``,
            returns ``(state, mem1, mem2)``.
        tokenizer: Tokenizer wrapper. In char mode (``tokenizer.charMode``)
            ``refine_context``, ``stoi``, ``itos`` and ``UNKNOWN_CHAR`` are
            used; otherwise ``tokenizer.tokenizer.encode``/``decode``.
            ``sample_logits`` is used in both modes.
        ctx_len: Maximum number of trailing tokens passed to the model and to
            the sampler at each generation step.
        temperature: Sampling temperature forwarded to ``sample_logits``.
        top_p: Forwarded to ``sample_logits`` as ``top_p_usual``.
        top_p_newline: Forwarded to ``sample_logits`` as ``top_p_newline``.
        debug_debug: When true, run a single trial of a single token and print
            the model input and raw output.

    Returns:
        The text that was printed during the last trial, as one string.

    Raises:
        ValueError: If ``context`` encodes to zero tokens; there would be no
            model output to sample the first token from.
    """
    if tokenizer.charMode:
        context = tokenizer.refine_context(context)
        ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
    else:
        ctx = tokenizer.tokenizer.encode(context)

    src_len = len(ctx)
    if src_len == 0:
        # Without this guard the first generation step fails with an opaque
        # AttributeError on ``None.clone()``.
        raise ValueError("context must encode to at least one token")
    src_ctx = list(ctx)

    # Output/state after the whole prompt has been consumed (filled in during
    # the first trial and reused by later ones), plus the two extra memories
    # the model threads through every forward call.
    init_out = init_state = None
    mem1 = mem2 = None

    pieces = []
    for trial in range(1 if debug_debug else NUM_TRIALS):
        ctx = list(src_ctx)
        pieces = []

        if trial == 0:
            # Feed the prompt prefix by prefix; only the last step needs logits.
            for i in range(src_len):
                x = ctx[: i + 1]
                if i == src_len - 1:
                    init_out, init_state, mem1, mem2 = model.forward(x, init_state, mem1, mem2)
                else:
                    init_state, mem1, mem2 = model.forward(x, init_state, mem1, mem2, preprocess_only=True)
            gc.collect()
            torch.cuda.empty_cache()

        # Index of the first generated token that has not been printed yet.
        out_last = src_len
        for i in range(src_len, src_len + (1 if debug_debug else LENGTH_PER_TRIAL)):
            x = ctx[: i + 1][-ctx_len:]

            if i == src_len:
                # First step: start from copies of the prompt's output/state.
                out = init_out.clone()
                state = init_state.clone()
            else:
                out, state, mem1, mem2 = model.forward(x, state, mem1, mem2)

            if debug_debug:
                print("model", np.array(x), "==>", np.array(out), np.max(out.cpu().numpy()), np.min(out.cpu().numpy()))

            if TOKEN_MODE == "pile":
                out[0] = -999999999  # disable <|endoftext|>

            ttt = int(
                tokenizer.sample_logits(
                    out,
                    x,
                    ctx_len,
                    temperature=temperature,
                    top_p_usual=top_p,
                    top_p_newline=top_p_newline,
                )
            )
            ctx.append(ttt)

            if tokenizer.charMode:
                char = tokenizer.itos[ttt]
            else:
                char = tokenizer.tokenizer.decode(ctx[out_last:])
                if '\ufffd' in char:
                    # Not a valid UTF-8 string yet: hold the pending tokens
                    # back until later tokens complete the character.
                    continue
                out_last = i + 1

            print(char, end="", flush=True)
            pieces.append(char)

        print("\n")

    return "".join(pieces)

# Interactive chat: every user turn is appended to the running conversation
# text, which is then handed to generate_response as the prompt.
print("\nInteractive inference mode. Type 'exit' to quit.\n")

while True:
    try:
        user_input = input("You: ")
    except EOFError:
        # stdin was closed (Ctrl-D or end of piped input): quit like 'exit'
        # instead of ending with a traceback.
        print()
        break

    if user_input.strip().lower() == 'exit':
        break

    context += f"You: {user_input}\nBot: "

    reply = generate_response(context, model, tokenizer, ctx_len, TEMPERATURE, top_p, top_p_newline, DEBUG_DEBUG)

    # When generate_response hands back the generated text, keep it in the
    # history so later turns can see the bot's earlier answers; a None return
    # leaves the history with just the user turns.
    if isinstance(reply, str):
        context += reply
    context += "\n"

    print(("-" * 50) + '\n')

  

📝 📜 ⏱️ ⬆️