Run LLM from Linux Command Line


The .gguf format is used by the llama.cpp project and its Python bindings, not by the Hugging Face Transformers library. To run a .gguf file, you need a different library, such as llama-cpp-python.

python3 -m pip install --upgrade pip

pip install transformers

pip install torch

pip install llama-cpp-python
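Before wiring up a full chat loop, a quick way to confirm that llama-cpp-python can load the model is a one-off completion. This is a minimal sketch; the model path is the same example file used in the script below, so adjust it to wherever your .gguf file actually lives:

from llama_cpp import Llama

# Load the .gguf model and run a single short completion as a smoke test
llm = Llama(model_path="/home/x/Downloads/Lexi-Llama-3-8B-Uncensored_Q8_0.gguf")
out = llm("Q: Name the capital of France. A:", max_tokens=16, stop=["\n"])
print(out["choices"][0]["text"].strip())

If this prints a sensible answer, the bindings and the model file are working.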

Python script (save as llm_script.py):

from llama_cpp import Llama

model_path = "/home/x/Downloads/Lexi-Llama-3-8B-Uncensored_Q8_0.gguf"

# Load the model
llm = Llama(model_path=model_path)

# Initialize conversation history
conversation = []

print("Welcome! Type 'exit' to end the conversation.")

while True:
    # Get user input
    user_input = input("You: ").strip()

    # Check if user wants to exit
    if user_input.lower() == 'exit':
        print("Goodbye!")
        break

    # Add user input to conversation history
    conversation.append(f"Human: {user_input}")

    # Construct the prompt with conversation history
    prompt = "\n".join(conversation) + "\nAI:"

    # Generate a response
    response = llm(prompt, max_tokens=200, stop=["Human:", "\n"])

    # Extract and print the response
    ai_response = response['choices'][0]['text'].strip()
    print("AI:", ai_response)

    # Add AI response to conversation history
    conversation.append(f"AI: {ai_response}")

Execute:

python llm_script.py
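The Llama constructor uses conservative defaults (a small context window, CPU-only inference). If answers get cut off or the conversation outgrows the context, the constructor also accepts n_ctx and n_gpu_layers. A possible variation, with illustrative values (GPU offload only takes effect if llama-cpp-python was built with GPU support):

llm = Llama(
    model_path=model_path,
    n_ctx=4096,        # larger context window for longer conversations
    n_gpu_layers=-1,   # offload all layers to the GPU; 0 keeps everything on the CPU
)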

To load a Hugging Face Transformers model directly, instead of a .gguf file:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "internlm/internlm2-chat-7b"  # Hugging Face model ID

# internlm2 ships custom model code, so trust_remote_code=True is required
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# Move the model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Initialize conversation history
conversation = []

print("Welcome! Type 'exit' to end the conversation.")

while True:
    # Get user input
    user_input = input("You: ").strip()

    # Check if user wants to exit
    if user_input.lower() == 'exit':
        print("Goodbye!")
        break

    # Add user input to conversation history
    conversation.append(f"Human: {user_input}")

    # Construct the prompt with conversation history
    prompt = "\n".join(conversation) + "\nAI:"

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the full output, then strip the prompt to keep only the new text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    ai_response = response[len(prompt):].strip()
    print("AI:", ai_response)

    # Add AI response to conversation history
    conversation.append(f"AI: {ai_response}")
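Chat-tuned models such as internlm2-chat usually expect their own prompt format rather than plain "Human:"/"AI:" lines. If the tokenizer ships a chat template, the prompt-building step could instead look like this (a sketch, assuming the template is available; for brevity it only passes the latest user turn, and the rest of the loop stays the same):

# Build the prompt from structured messages instead of raw "Human:"/"AI:" lines
messages = [{"role": "user", "content": user_input}]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # append the assistant marker so the model answers next
)
inputs = tokenizer(prompt, return_tensors="pt").to(device)

Also note that a 7B model loaded in full precision needs roughly 28 GB of memory; passing torch_dtype=torch.float16 to from_pretrained roughly halves that and is the usual choice when running on a GPU.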
