web search support.

2026-02-21 23:17:42 -05:00
parent d7d0583b4e
commit 813297c099
2 changed files with 290 additions and 7 deletions

View File

@@ -35,7 +35,7 @@ MAX_TOOL_ROUNDS = 10
_ANTHROPIC_CHUNK_SIZE = 180_000
_SYSTEM_PROMPT = (
"You are a helpful coding assistant with access to a PowerShell tool and MCP file tools (read_file, list_directory, search_files, get_file_summary). "
"You are a helpful coding assistant with access to a PowerShell tool and MCP tools (file access: read_file, list_directory, search_files, get_file_summary, web access: web_search, fetch_url). "
"When asked to create or edit files, prefer targeted edits over full rewrites. "
"Always explain what you are doing before invoking the tool.\n\n"
"When writing or rewriting large files (especially those containing quotes, backticks, or special characters), "
@@ -791,3 +791,4 @@ def send(
elif _provider == "anthropic":
return _send_anthropic(md_content, user_message, base_dir, file_items)
raise ValueError(f"unknown provider: {_provider}")

View File

@@ -17,6 +17,10 @@
from pathlib import Path
import summarize
import urllib.request
import urllib.parse
from html.parser import HTMLParser
import re as _re
# ------------------------------------------------------------------ state
@@ -117,8 +121,35 @@ def list_directory(path: str) -> str:
return f"ERROR: not a directory: {path}" return f"ERROR: not a directory: {path}"
try: try:
entries = sorted(p.iterdir(), key=lambda e: (e.is_file(), e.name.lower())) entries = sorted(p.iterdir(), key=lambda e: (e.is_file(), e.name.lower()))
lines = [f"Directory: {p}", ""] lines = [f"Directory: {p}", "" {
for entry in entries: "name": "web_search",
"description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query."
}
},
"required": ["query"]
}
},
{
"name": "fetch_url",
"description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to fetch."
}
},
"required": ["url"]
}
},
] for entry in entries:
kind = "file" if entry.is_file() else "dir " kind = "file" if entry.is_file() else "dir "
size = f"{entry.stat().st_size:>10,} bytes" if entry.is_file() else "" size = f"{entry.stat().st_size:>10,} bytes" if entry.is_file() else ""
lines.append(f" [{kind}] {entry.name:<40} {size}") lines.append(f" [{kind}] {entry.name:<40} {size}")
@@ -142,8 +173,35 @@ def search_files(path: str, pattern: str) -> str:
matches = sorted(p.glob(pattern))
if not matches:
return f"No files matched '{pattern}' in {path}"
lines = [f"Search '{pattern}' in {p}:", ""] lines = [f"Search '{pattern}' in {p}:", "" {
for m in matches: "name": "web_search",
"description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query."
}
},
"required": ["query"]
}
},
{
"name": "fetch_url",
"description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to fetch."
}
},
"required": ["url"]
}
},
] for m in matches:
rel = m.relative_to(p)
kind = "file" if m.is_file() else "dir "
lines.append(f" [{kind}] {rel}")
@@ -173,9 +231,201 @@ def get_file_summary(path: str) -> str:
return f"ERROR summarising '{path}': {e}" return f"ERROR summarising '{path}': {e}"
# ------------------------------------------------------------------ web tools
class _DDGParser(HTMLParser):
def __init__(self):
super().__init__()
self.results = []
self.in_result = False
self.in_title = False
self.in_snippet = False
self.current_link = ""
self.current_title = ""
self.current_snippet = ""
def handle_starttag(self, tag, attrs):
attrs = dict(attrs)
if tag == "a" and "result__url" in attrs.get("class", ""):
self.current_link = attrs.get("href", "")
if tag == "a" and "result__snippet" in attrs.get("class", ""):
self.in_snippet = True
if tag == "h2" and "result__title" in attrs.get("class", ""):
self.in_title = True
def handle_endtag(self, tag):
if tag == "a" and self.in_snippet:
self.in_snippet = False
if tag == "h2" and self.in_title:
self.in_title = False
if self.current_link:
self.results.append({
"title": self.current_title.strip(),
"link": self.current_link,
"snippet": self.current_snippet.strip()
})
self.current_title = ""
self.current_snippet = ""
self.current_link = ""
def handle_data(self, data):
if self.in_title:
self.current_title += data
if self.in_snippet:
self.current_snippet += data
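# Editor's illustrative sketch, not part of this commit: a minimal check of _DDGParser.
# The markup order below is arranged to suit the parser (url and snippet anchors appear
# before the closing </h2>); real DuckDuckGo result HTML may be ordered differently.
def _demo_ddg_parser():
    p = _DDGParser()
    p.feed(
        '<div class="result">'
        '<a class="result__url" href="https://example.com/docs">example.com/docs</a>'
        '<a class="result__snippet">An example snippet.</a>'
        '<h2 class="result__title">Example Domain Docs</h2>'
        '</div>'
    )
    # p.results -> [{'title': 'Example Domain Docs', 'link': 'https://example.com/docs',
    #                'snippet': 'An example snippet.'}]
    return p.results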
class _TextExtractor(HTMLParser):
def __init__(self):
super().__init__()
self.text = []
self.hide = 0
self.ignore_tags = {'script', 'style', 'head', 'meta', 'nav', 'header', 'footer', 'noscript', 'svg'}
def handle_starttag(self, tag, attrs):
if tag in self.ignore_tags:
self.hide += 1
def handle_endtag(self, tag):
if tag in self.ignore_tags:
self.hide -= 1
def handle_data(self, data):
if self.hide == 0:
cleaned = data.strip()
if cleaned:
self.text.append(cleaned)
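# Editor's illustrative sketch, not part of this commit: _TextExtractor drops everything
# inside the ignored tags (head, style, nav, ...) and keeps only the visible text.
def _demo_text_extractor():
    p = _TextExtractor()
    p.feed(
        "<html><head><title>x</title><style>p{}</style></head>"
        "<body><nav>menu</nav><p>Hello <b>world</b></p></body></html>"
    )
    return " ".join(p.text)  # -> "Hello world"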
def web_search(query: str) -> str:
"""Search the web using DuckDuckGo HTML and return top results."""
url = "https://html.duckduckgo.com/html/?q=" + urllib.parse.quote(query)
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
try:
html = urllib.request.urlopen(req, timeout=10).read().decode('utf-8', errors='ignore')
parser = _DDGParser()
parser.feed(html)
if not parser.results:
return f"No results found for '{query}'"
lines = [f"Search Results for '{query}':
" {
"name": "web_search",
"description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query."
}
},
"required": ["query"]
}
},
{
"name": "fetch_url",
"description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to fetch."
}
},
"required": ["url"]
}
},
] for i, r in enumerate(parser.results[:5], 1):
lines.append(f"{i}. {r['title']}
URL: {r['link']}
Snippet: {r['snippet']}
")
return "
".join(lines)
except Exception as e:
return f"ERROR searching web for '{query}': {e}"
def fetch_url(url: str) -> str:
"""Fetch a URL and return its text content (stripped of HTML tags)."""
# Correct duckduckgo redirect links if passed
if url.startswith("//duckduckgo.com/l/?uddg="):
url = urllib.parse.unquote(url.split("uddg=")[1].split("&")[0])
if not url.startswith("http"):
url = "https://" + url
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
try:
html = urllib.request.urlopen(req, timeout=10).read().decode('utf-8', errors='ignore')
parser = _TextExtractor()
parser.feed(html)
full_text = " ".join(parser.text)
full_text = _re.sub(r'\s+', ' ', full_text)
# Limit to 40k chars to prevent context blowup
if len(full_text) > 40000:
return full_text[:40000] + "\n... (content truncated)"
return full_text
except Exception as e:
return f"ERROR fetching URL '{url}': {e}"
# ------------------------------------------------------------------ tool dispatch
TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary"}
TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary", "web_search", "fetch_url"}
def dispatch(tool_name: str, tool_input: dict) -> str:
@@ -190,6 +440,10 @@ def dispatch(tool_name: str, tool_input: dict) -> str:
return search_files(tool_input.get("path", ""), tool_input.get("pattern", "*"))
if tool_name == "get_file_summary":
return get_file_summary(tool_input.get("path", ""))
if tool_name == "web_search":
return web_search(tool_input.get("query", ""))
if tool_name == "fetch_url":
return fetch_url(tool_input.get("url", ""))
return f"ERROR: unknown MCP tool '{tool_name}'" return f"ERROR: unknown MCP tool '{tool_name}'"
@@ -272,4 +526,32 @@ MCP_TOOL_SPECS = [
"required": ["path"], "required": ["path"],
}, },
}, },
{
"name": "web_search",
"description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query."
}
},
"required": ["query"]
}
},
{
"name": "fetch_url",
"description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to fetch."
}
},
"required": ["url"]
}
},
]
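# Editor's note, not part of this commit: the specs above describe their JSON schema under a
# "parameters" key. If the Anthropic path (_send_anthropic) forwards tools to the Messages API,
# which expects "input_schema" instead, an adapter along these lines would be needed
# (the helper name is illustrative, not taken from the codebase):
def _to_anthropic_tools(specs):
    return [
        {"name": s["name"], "description": s["description"], "input_schema": s["parameters"]}
        for s in specs
    ]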