Caching SearXNG Version 2
In version 2 the cache filenames are hashed, so queries containing international characters or symbols are supported and are less likely to break SearXNG (a sketch of the naming scheme follows the file list). Two files are modified:
- webapp.py (/usr/local/searxng/searxng-src/searx/webapp.py): def search()
- __init__.py (/usr/local/searxng/searxng-src/searx/search/__init__.py): class Search
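For reference, here is a minimal sketch of the naming scheme used by both files below (the helper name cache_path is only for illustration, it is not part of the patch): the lower-cased query, the page number and the first category are concatenated, MD5-hashed, the first two hex characters of the digest pick the subdirectory, and the full digest is the filename.

import hashlib
import os

def cache_path(query, pageno, category, cache_dir="cache"):
    # Same key construction as the patches below
    key = query.lower() + str(pageno) + str(category)
    hex_dig = hashlib.md5(key.encode()).hexdigest()
    # First two hex characters select one of the 256 subdirectories (00..ff)
    return os.path.join(cache_dir, hex_dig[:2], hex_dig)

# A query full of non-ASCII characters still yields a plain hex path
print(cache_path("東京 天気", 1, "general"))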
Create the cache directory and its 256 two-hex-digit subdirectories (00–ff), then hand ownership to the searxng user:

sudo mkdir -p /usr/local/searxng/searxng-src/searx/cache
for i in {0..255}; do sudo mkdir -p /usr/local/searxng/searxng-src/searx/cache/$(printf "%02x" $i); done
sudo chown -R searxng:searxng /usr/local/searxng/searxng-src/searx/cache
sudo chmod -R 755 /usr/local/searxng/searxng-src/searx/cache
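If you would rather create the fan-out from Python (for example from a maintenance script), the loop below is equivalent to the shell commands above; it assumes the same install path, and ownership and permissions still need to be set with chown/chmod as shown.

import os

base = "/usr/local/searxng/searxng-src/searx/cache"
for i in range(256):
    # Two-hex-digit subdirectories 00 .. ff, matching the shell loop above
    os.makedirs(os.path.join(base, f"{i:02x}"), exist_ok=True)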
In webapp.py, add the imports near the top of the file and place the caching block inside def search(), after the search has run so result_container holds the results:

import hashlib
import os

    # Build the cache key from the query, page number and first category
    # (lower-cased so it matches the key built in search/__init__.py below)
    fname = request.form['q'].lower() + str(search_query.pageno) + str(search_query.categories[0])
    # Hash the key so international characters and symbols become a safe filename
    hash_object = hashlib.md5(fname.encode())
    hex_dig = hash_object.hexdigest()
    subdirectory = hex_dig[:2]  # the first 2 characters of the hash select the subdirectory
    cache_dir = os.path.abspath(os.path.join("cache", subdirectory))  # relative to the searx working directory created above
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    file_path = os.path.join(cache_dir, hex_dig)  # the full hash is the filename
    if not os.path.exists(file_path):
        responsex = webutils.get_json_response(search_query, result_container)
        if len(responsex.strip()) > 1000:  # only cache reasonably large responses, not near-empty ones like '{}'
            with open(file_path, "w") as text_file:
                text_file.write(responsex)  # json.dump(responsex, text_file)
In search/__init__.py, add the imports near the top of the file, replace search_standard() in class Search, and add the two new methods that follow. The methods also use json, time and datetime; add those imports too if they are not already present in the file.

import hashlib
import json
import os
import time
from datetime import datetime

    def search_standard(self):
        requests, self.actual_timeout = self._get_requests()

        # Rebuild the same cache key / path as webapp.py
        cache_dir = 'cache'
        fname = self.search_query.query.lower() + str(self.search_query.pageno) + str(self.search_query.categories[0])
        hash_object = hashlib.md5(fname.encode())
        hex_dig = hash_object.hexdigest()
        subdirectory = hex_dig[:2]
        hashed_filename = hex_dig  # the full hash is the filename
        query_dir = os.path.join(cache_dir, subdirectory)
        mock_data_filename = os.path.join(query_dir, hashed_filename)

        if requests:
            if os.path.isfile(mock_data_filename):
                # Cached: replay the stored results instead of querying the engines
                self.search_multiple_requests2(requests, hashed_filename)
            else:
                # Not cached: run the normal live search
                self.search_multiple_requests(requests)
        return True
    def search_multiple_requests2(self, requests, hashed_filename):
        # Replay a cached search: the stored JSON stands in for the live engine responses
        search_id = str(uuid4())
        mock_result_container = ResultContainer()

        cache_dir = 'cache'
        fname = self.search_query.query.lower() + str(self.search_query.pageno) + str(self.search_query.categories[0])
        hash_object = hashlib.md5(fname.encode())
        hex_dig = hash_object.hexdigest()
        subdirectory = hex_dig[:2]
        query_dir = os.path.join(cache_dir, subdirectory)
        mock_data_filename = os.path.join(query_dir, hashed_filename)

        with open(mock_data_filename, encoding='utf-8') as mock_data_file:
            mock_data = json.load(mock_data_file)
        mock_results = mock_data['results']

        threads = []
        for engine_name, _, _ in requests:
            th = threading.Thread(
                target=self.mock_search_function,
                args=(engine_name, mock_results, mock_result_container),
                name=search_id,
            )
            th._timeout = False
            th._engine_name = engine_name
            th.start()
            threads.append(th)

        remaining_time = None
        for th in threads:
            if th.name == search_id:
                if remaining_time is None:
                    remaining_time = self.actual_timeout - (default_timer() - self.start_time)
                th.join(remaining_time)
                if th.is_alive():
                    th._timeout = True
                    self.result_container.add_unresponsive_engine(th._engine_name, 'timeout')
                    PROCESSORS[th._engine_name].logger.error('engine timeout')

        for th in threads:
            th.join()

        self.result_container = mock_result_container
    def mock_search_function(self, engine_name, mock_results, result_container):
        time.sleep(0.1)
        # publishedDate was serialised to an ISO string when the cache was written;
        # convert it back to a datetime so the templates can format it
        for result in mock_results:
            if 'publishedDate' in result:
                if isinstance(result['publishedDate'], str):
                    result['publishedDate'] = datetime.fromisoformat(result['publishedDate'])
        result_container.extend(engine_name, mock_results)
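To check whether a given query has been cached, a small standalone helper like the one below can be used. It is only a sketch: it assumes it is run from the searx directory (so the relative 'cache' path matches) and that the cached files contain the JSON written by webapp.py above with a top-level 'results' list; inspect_cache is not part of the patch.

import hashlib
import json
import os

def inspect_cache(query, pageno=1, category="general", cache_dir="cache"):
    # Rebuild the same key used by search_standard() above
    key = query.lower() + str(pageno) + str(category)
    hex_dig = hashlib.md5(key.encode()).hexdigest()
    path = os.path.join(cache_dir, hex_dig[:2], hex_dig)
    if not os.path.isfile(path):
        print("not cached:", path)
        return
    with open(path, encoding="utf-8") as fh:
        data = json.load(fh)
    print(path, "->", len(data.get("results", [])), "cached results")

inspect_cache("example query")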