SearX Robot
Command (wget, or the curl equivalent):
wget -qO- "http://localhost/searxng/search?q=test&category_general=&language=auto&time_range=&safesearch=0&theme=simple"
curl "http://localhost/searxng/search?q=test&category_general=&language=auto&time_range=&safesearch=0&theme=simple"
Build the cache from the keyword dataset
import json
import subprocess
import urllib.parse

# Path to the dataset JSON file
json_file_path = 'path_to_your_json_file.json'

# Read the JSON file
with open(json_file_path, 'r') as file:
    data = json.load(file)

# Iterate through the JSON data to extract keywords
for item in data:
    keyword = item['keyphrase']
    # URL-encode the keyword so spaces and special characters survive the query string
    encoded = urllib.parse.quote_plus(keyword)
    # Construct the wget command
    wget_command = f'wget -qO- "http://localhost/searxng/search?q={encoded}&category_general=&language=auto&time_range=&safesearch=0&theme=simple"'
    # Execute the wget command
    subprocess.run(wget_command, shell=True)
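The script above assumes each dataset entry carries a keyphrase field; a minimal sketch of the expected file layout (filename and values are placeholders):
[
    {"keyphrase": "test"},
    {"keyphrase": "privacy respecting search"}
]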
Keyword Datasets
Running this list produces new keyword suggestions straight from SearXNG; the script below scans the cache and extracts those suggestions.
# Scan the cache and grab all the keywords
import json
from pathlib import Path

def process_searxng_cache(cache_dir, output_file):
    # Create a set to store unique processed entries
    entries = set()
    # Walk through all directories and files in the cache directory
    cache_path = Path(cache_dir)
    # Debug: Check if the cache directory exists
    if not cache_path.exists():
        print(f"Cache directory {cache_path} does not exist.")
        return
    # Debug: Print the cache directory path
    print(f"Cache directory: {cache_path}")
    for subdir in cache_path.iterdir():
        if subdir.is_dir():
            print(f"Processing subdirectory: {subdir}")  # Debug: Print each subdirectory being processed
            for file in subdir.iterdir():
                if file.is_file():
                    # Attempt to open and read each file as JSON
                    print(f"Found file: {file}")  # Debug: Print each file being processed
                    try:
                        with file.open('r', encoding='utf-8') as f:
                            data = json.load(f)
                            # Print the JSON data to debug
                            print(f"Processing file: {file}")
                            print(f"JSON data: {data}")
                            # Check if the required keys are in the JSON data
                            if 'query' in data and 'suggestions' in data:
                                query = data['query']
                                suggestions = data['suggestions']
                                print(f"Query: {query}, Suggestions: {suggestions}")  # Debug output
                                for suggestion in suggestions:
                                    entries.add(f"{query}: {suggestion}")
                            else:
                                print(f"Missing 'query' or 'suggestions' in file {file}")
                    except (json.JSONDecodeError, KeyError, IOError) as e:
                        print(f"Error processing file {file}: {e}")
    # Write the entries to the output file
    with Path(output_file).open('w', encoding='utf-8') as out_f:
        for entry in sorted(entries):
            out_f.write(f"{entry}\n")

if __name__ == "__main__":
    cache_dir = "/usr/local/searxng/searxng-src/searx/cache/"
    output_file = "keywords.txt"
    process_searxng_cache(cache_dir, output_file)
    print(f"Processed entries have been saved to {output_file}")
Use the suggestions to crawl more
import subprocess
import urllib.parse

# Path to your text file
text_file_path = 'path_to_your_text_file.txt'

# Read the text file
with open(text_file_path, 'r') as file:
    lines = file.readlines()

# Iterate through the lines to extract keywords
for line in lines:
    # Lines are written as "query: suggestion"; keep the suggestion part
    keyword = line.strip().split(': ', 1)[-1]
    # URL-encode the keyword so spaces and special characters survive the query string
    encoded = urllib.parse.quote_plus(keyword)
    # Construct the wget command
    wget_command = f'wget -qO- "http://localhost/searxng/search?q={encoded}&category_general=&language=auto&time_range=&safesearch=0&theme=simple"'
    # Execute the wget command
    subprocess.run(wget_command, shell=True)
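The three steps naturally chain into a loop: crawl the seed keywords, harvest the suggestions, crawl those in turn. A minimal driver sketch, assuming the extractor above is saved as extract_keywords.py next to it (that filename, the three-round depth, and the one-second delay are arbitrary choices, not part of the original scripts):
import subprocess
import time
import urllib.parse
from pathlib import Path

from extract_keywords import process_searxng_cache  # the extractor defined above

def crawl(keyword):
    # Issue one search request against the local instance (same endpoint as above)
    encoded = urllib.parse.quote_plus(keyword)
    subprocess.run(
        f'wget -qO- "http://localhost/searxng/search?q={encoded}'
        f'&category_general=&language=auto&time_range=&safesearch=0&theme=simple"',
        shell=True,
    )

seen = set()  # suggestions already crawled, so rounds do not repeat work
for round_no in range(3):  # arbitrary crawl depth
    process_searxng_cache("/usr/local/searxng/searxng-src/searx/cache/", "keywords.txt")
    fresh = []
    for line in Path("keywords.txt").read_text(encoding="utf-8").splitlines():
        suggestion = line.split(': ', 1)[-1].strip()  # lines are "query: suggestion"
        if suggestion and suggestion not in seen:
            seen.add(suggestion)
            fresh.append(suggestion)
    if not fresh:
        break  # no new suggestions surfaced; the crawl has converged
    for keyword in fresh:
        crawl(keyword)
        time.sleep(1)  # be gentle with the local instance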