SearX Robot
Command (either wget or curl works):
wget -qO- "http://localhost/searxng/search?q=test&category_general=&language=auto&time_range=&safesearch=0&theme=simple"
curl "http://localhost/searxng/search?q=test&category_general=&language=auto&time_range=&safesearch=0&theme=simple"
Build the cache from the keyword dataset
import json
import subprocess
import time
from urllib.parse import quote_plus

# Path to dataset JSON file
json_file_path = 'path_to_your_json_file.json'

# Read the JSON file
with open(json_file_path, 'r') as file:
    data = json.load(file)

# Iterate through the JSON data to extract keywords
for item in data:
    keyword = item['keyphrase']
    # Construct the wget command (URL-encode the keyword so spaces and
    # special characters survive the query string)
    wget_command = f'wget -qO- "http://localhost/searxng/search?q={quote_plus(keyword)}&category_general=&language=auto&time_range=&safesearch=0&theme=simple"'
    # Execute the wget command
    subprocess.run(wget_command, shell=True)
    # Add a 1 second delay between requests
    time.sleep(1)
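The script assumes the dataset is a JSON array of objects, each with a 'keyphrase' field (the field name is taken from the script above; the values here are a hypothetical sample):

[
  {"keyphrase": "open source metasearch"},
  {"keyphrase": "self-hosted search engine"}
]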
Keyword Datasets
Running this list fills the cache with new keyword suggestions straight from SearXNG. The script below scans the cache and extracts them:
# Scan the cache and grab all the keywords
import json
from pathlib import Path

def process_searxng_cache(cache_dir, output_file):
    # Create a set to store unique processed entries
    entries = set()

    # Walk through all directories and files in the cache directory
    cache_path = Path(cache_dir)

    # Debug: check that the cache directory exists
    if not cache_path.exists():
        print(f"Cache directory {cache_path} does not exist.")
        return
    print(f"Cache directory: {cache_path}")

    for subdir in cache_path.iterdir():
        if subdir.is_dir():
            print(f"Processing subdirectory: {subdir}")  # Debug output
            for file in subdir.iterdir():
                if file.is_file():
                    print(f"Found file: {file}")  # Debug output
                    # Attempt to open and read each file as JSON
                    try:
                        with file.open('r', encoding='utf-8') as f:
                            data = json.load(f)
                        print(f"JSON data: {data}")  # Debug output
                        # Check that the required keys are in the JSON data
                        if 'query' in data and 'suggestions' in data:
                            query = data['query']
                            suggestions = data['suggestions']
                            print(f"Query: {query}, Suggestions: {suggestions}")
                            for suggestion in suggestions:
                                entries.add(f"{query}: {suggestion}")
                        else:
                            print(f"Missing 'query' or 'suggestions' in file {file}")
                    except (json.JSONDecodeError, KeyError, IOError) as e:
                        print(f"Error processing file {file}: {e}")

    # Write the sorted, de-duplicated entries to the output file
    with Path(output_file).open('w', encoding='utf-8') as out_f:
        for entry in sorted(entries):
            out_f.write(f"{entry}\n")

if __name__ == "__main__":
    cache_dir = "/usr/local/searxng/searxng-src/searx/cache/"
    output_file = "keywords.txt"
    process_searxng_cache(cache_dir, output_file)
    print(f"Processed entries have been saved to {output_file}")
Use the suggestions to crawl more
import subprocess
import time
from urllib.parse import quote_plus

# Path to your text file
text_file_path = 'path_to_your_text_file.txt'

# Read the text file
with open(text_file_path, 'r') as file:
    lines = file.readlines()

# Iterate through the lines to extract keywords
for line in lines:
    keyword = line.strip()  # Remove any leading/trailing whitespace
    # Construct the wget command (URL-encode the keyword)
    wget_command = f'wget -qO- "http://localhost/searxng/search?q={quote_plus(keyword)}&category_general=&language=auto&time_range=&safesearch=0&theme=simple"'
    # Execute the wget command
    subprocess.run(wget_command, shell=True)
    # Add a 1 second delay between requests
    time.sleep(1)
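Each crawl pass writes fresh results into the cache, so re-running the extraction script afterwards yields a larger keyword list. The extract-and-crawl cycle can be repeated as many times as needed to keep growing the dataset.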