SearX Robot

Issue a single query against the local SearXNG instance with either wget or curl:

wget -qO- "http://localhost/searxng/search?q=test&category_general=&language=auto&time_range=&safesearch=0&theme=simple"

curl "http://localhost/searxng/search?q=test&category_general=&language=auto&time_range=&safesearch=0&theme=simple"

Build the cache from the keyword dataset

import json
import subprocess
from urllib.parse import quote_plus

# Path to the dataset JSON file
json_file_path = 'path_to_your_json_file.json'

# Read the JSON file
with open(json_file_path, 'r') as file:
    data = json.load(file)

# Iterate through the JSON data to extract keywords
for item in data:
    keyword = item['keyphrase']

    # Construct the search URL; URL-encode the keyword so spaces and
    # special characters survive the query string
    url = (f'http://localhost/searxng/search?q={quote_plus(keyword)}'
           '&category_general=&language=auto&time_range=&safesearch=0&theme=simple')

    # Run wget; passing the arguments as a list avoids shell quoting problems
    subprocess.run(['wget', '-qO-', url])

Keyword Datasets

  1. https://www.kaggle.com/datasets/hofesiy/2019-search-engine-keywords
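The crawl script above expects a JSON array of objects with a keyphrase field. If the Kaggle dataset ships as CSV (likely, but check the download), a small conversion step produces that shape; the column name keyword below is a guess, adjust it to the file's actual header:

import csv
import json

# 'keyword' is a hypothetical column name; check the dataset's real header
with open('keywords.csv', newline='', encoding='utf-8') as f:
    rows = [{'keyphrase': row['keyword']} for row in csv.DictReader(f)]

with open('path_to_your_json_file.json', 'w', encoding='utf-8') as f:
    json.dump(rows, f, indent=2)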

Running this list populates the SearXNG cache with fresh keyword suggestions for every query, straight from searx. The next script scans that cache and extracts the suggestions.
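Each cache file is expected to be a JSON object carrying at least the query and suggestions keys, which is exactly what the scanner below checks for. An illustrative, made-up entry:

{
    "query": "test",
    "suggestions": ["test suite", "test driven development"]
}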

# Scan the cache and grab all the keywords
import json
from pathlib import Path

def process_searxng_cache(cache_dir, output_file):
    # Create a set to store unique processed entries
    entries = set()

    # Walk through all directories and files in the cache directory
    cache_path = Path(cache_dir)

    # Debug: check that the cache directory exists
    if not cache_path.exists():
        print(f"Cache directory {cache_path} does not exist.")
        return

    # Debug: print the cache directory path
    print(f"Cache directory: {cache_path}")

    for subdir in cache_path.iterdir():
        if subdir.is_dir():
            print(f"Processing subdirectory: {subdir}")  # Debug
            for file in subdir.iterdir():
                if file.is_file():
                    # Attempt to open and read each file as JSON
                    print(f"Found file: {file}")  # Debug
                    try:
                        with file.open('r', encoding='utf-8') as f:
                            data = json.load(f)

                        # Debug: print the parsed JSON data
                        print(f"Processing file: {file}")
                        print(f"JSON data: {data}")

                        # Check that the required keys are in the JSON data
                        if 'query' in data and 'suggestions' in data:
                            query = data['query']
                            suggestions = data['suggestions']
                            print(f"Query: {query}, Suggestions: {suggestions}")  # Debug
                            for suggestion in suggestions:
                                entries.add(f"{query}: {suggestion}")
                        else:
                            print(f"Missing 'query' or 'suggestions' in file {file}")
                    except (json.JSONDecodeError, KeyError, IOError) as e:
                        print(f"Error processing file {file}: {e}")

    # Write the sorted, deduplicated entries to the output file
    with Path(output_file).open('w', encoding='utf-8') as out_f:
        for entry in sorted(entries):
            out_f.write(f"{entry}\n")

if __name__ == "__main__":
    cache_dir = "/usr/local/searxng/searxng-src/searx/cache/"
    output_file = "keywords.txt"
    process_searxng_cache(cache_dir, output_file)
    print(f"Processed entries have been saved to {output_file}")

Use the suggestions to crawl more

import subprocess
from urllib.parse import quote_plus

# Path to the keyword file produced by the cache scan
text_file_path = 'path_to_your_text_file.txt'

# Read the text file
with open(text_file_path, 'r') as file:
    lines = file.readlines()

# Iterate through the lines to extract keywords
for line in lines:
    # Lines have the form "query: suggestion"; keep only the suggestion
    keyword = line.strip().split(': ', 1)[-1]

    # Construct the search URL, URL-encoding the keyword
    url = (f'http://localhost/searxng/search?q={quote_plus(keyword)}'
           '&category_general=&language=auto&time_range=&safesearch=0&theme=simple')

    # Run wget without invoking a shell
    subprocess.run(['wget', '-qO-', url])
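The scan and crawl steps can be repeated so each round of suggestions seeds the next crawl. A minimal sketch of that loop; crawl and read_suggestions are names introduced here, not part of the scripts above, and a seen set keeps any keyword from being fetched twice:

import subprocess
from urllib.parse import quote_plus

def crawl(keyword):
    # Same request the crawl scripts issue
    url = (f'http://localhost/searxng/search?q={quote_plus(keyword)}'
           '&category_general=&language=auto&time_range=&safesearch=0&theme=simple')
    subprocess.run(['wget', '-qO-', url])

def read_suggestions(path='keywords.txt'):
    # keywords.txt holds "query: suggestion" lines from the cache scan
    with open(path, encoding='utf-8') as f:
        return {line.strip().split(': ', 1)[-1] for line in f if line.strip()}

seen = set()
for _ in range(3):  # number of rounds is arbitrary
    # Re-run the cache scan here (process_searxng_cache above) to refresh keywords.txt
    fresh = read_suggestions() - seen
    for keyword in sorted(fresh):
        crawl(keyword)
    seen |= fresh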
