diff --git a/ai_client.py b/ai_client.py
index d3f1828..277e6f2 100644
--- a/ai_client.py
+++ b/ai_client.py
@@ -35,7 +35,7 @@ MAX_TOOL_ROUNDS = 10
 _ANTHROPIC_CHUNK_SIZE = 180_000
 
 _SYSTEM_PROMPT = (
-    "You are a helpful coding assistant with access to a PowerShell tool and MCP file tools (read_file, list_directory, search_files, get_file_summary). "
+    "You are a helpful coding assistant with access to a PowerShell tool and MCP tools (file access: read_file, list_directory, search_files, get_file_summary; web access: web_search, fetch_url). "
     "When asked to create or edit files, prefer targeted edits over full rewrites. "
     "Always explain what you are doing before invoking the tool.\n\n"
     "When writing or rewriting large files (especially those containing quotes, backticks, or special characters), "
@@ -791,3 +791,4 @@ def send(
     elif _provider == "anthropic":
         return _send_anthropic(md_content, user_message, base_dir, file_items)
     raise ValueError(f"unknown provider: {_provider}")
+
diff --git a/mcp_client.py b/mcp_client.py
index 7de1963..025abb6 100644
--- a/mcp_client.py
+++ b/mcp_client.py
@@ -17,6 +17,10 @@ from pathlib import Path
 
 import summarize
+import urllib.request
+import urllib.parse
+from html.parser import HTMLParser
+import re as _re
 
 
 # ------------------------------------------------------------------ state
 
@@ -173,9 +177,122 @@ def get_file_summary(path: str) -> str:
         return f"ERROR summarising '{path}': {e}"
 
 
+# ------------------------------------------------------------------ web tools
+
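+# Review note: _DDGParser scrapes DuckDuckGo's HTML endpoint and assumes its
+# markup keeps the result__title / result__snippet / result__url CSS classes;
+# if that markup changes, web_search degrades to "No results found" rather
+# than raising.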
+class _DDGParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.results = []
+        self.in_result = False
+        self.in_title = False
+        self.in_snippet = False
+        self.current_link = ""
+        self.current_title = ""
+        self.current_snippet = ""
+
+    def handle_starttag(self, tag, attrs):
+        attrs = dict(attrs)
+        if tag == "a" and "result__url" in attrs.get("class", ""):
+            self.current_link = attrs.get("href", "")
+        if tag == "a" and "result__snippet" in attrs.get("class", ""):
+            self.in_snippet = True
+        if tag == "h2" and "result__title" in attrs.get("class", ""):
+            self.in_title = True
+
+    def handle_endtag(self, tag):
+        if tag == "a" and self.in_snippet:
+            self.in_snippet = False
+        if tag == "h2" and self.in_title:
+            self.in_title = False
+        # The result__url anchor closes after the title and snippet, so a
+        # captured link means a complete result has been seen.
+        if self.current_link:
+            self.results.append({
+                "title": self.current_title.strip(),
+                "link": self.current_link,
+                "snippet": self.current_snippet.strip()
+            })
+            self.current_title = ""
+            self.current_snippet = ""
+            self.current_link = ""
+
+    def handle_data(self, data):
+        if self.in_title:
+            self.current_title += data
+        if self.in_snippet:
+            self.current_snippet += data
+
+class _TextExtractor(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.text = []
+        self.hide = 0
+        self.ignore_tags = {'script', 'style', 'head', 'meta', 'nav', 'header', 'footer', 'noscript', 'svg'}
+
+    def handle_starttag(self, tag, attrs):
+        if tag in self.ignore_tags:
+            self.hide += 1
+
+    def handle_endtag(self, tag):
+        if tag in self.ignore_tags:
+            self.hide -= 1
+
+    def handle_data(self, data):
+        if self.hide == 0:
+            cleaned = data.strip()
+            if cleaned:
+                self.text.append(cleaned)
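+
+# web_search returns a plain-text block: a header line followed by up to five
+# numbered entries, each with a title, URL, and snippet.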
+def web_search(query: str) -> str:
+    """Search the web using DuckDuckGo HTML and return top results."""
+    url = "https://html.duckduckgo.com/html/?q=" + urllib.parse.quote(query)
+    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
+    try:
+        html = urllib.request.urlopen(req, timeout=10).read().decode('utf-8', errors='ignore')
+        parser = _DDGParser()
+        parser.feed(html)
+        if not parser.results:
+            return f"No results found for '{query}'"
+        lines = [f"Search Results for '{query}':\n"]
+        for i, r in enumerate(parser.results[:5], 1):
+            lines.append(f"{i}. {r['title']}\n   URL: {r['link']}\n   Snippet: {r['snippet']}\n")
+        return "\n".join(lines)
+    except Exception as e:
+        return f"ERROR searching web for '{query}': {e}"
+
+def fetch_url(url: str) -> str:
+    """Fetch a URL and return its text content (stripped of HTML tags)."""
+    # Unwrap DuckDuckGo redirect links if one is passed through
+    if url.startswith("//duckduckgo.com/l/?uddg="):
+        url = urllib.parse.unquote(url.split("uddg=")[1].split("&")[0])
+
+    if not url.startswith("http"):
+        url = "https://" + url
+
+    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
+    try:
+        html = urllib.request.urlopen(req, timeout=10).read().decode('utf-8', errors='ignore')
+        parser = _TextExtractor()
+        parser.feed(html)
+        full_text = " ".join(parser.text)
+        full_text = _re.sub(r'\s+', ' ', full_text)
+        # Limit to 40k chars to prevent context blowup
+        if len(full_text) > 40000:
+            return full_text[:40000] + "\n... (content truncated)"
+        return full_text
+    except Exception as e:
+        return f"ERROR fetching URL '{url}': {e}"
(content truncated)" + return full_text + except Exception as e: + return f"ERROR fetching URL '{url}': {e}" + # ------------------------------------------------------------------ tool dispatch -TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary"} + +TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary", "web_search", "fetch_url"} def dispatch(tool_name: str, tool_input: dict) -> str: @@ -190,6 +440,10 @@ def dispatch(tool_name: str, tool_input: dict) -> str: return search_files(tool_input.get("path", ""), tool_input.get("pattern", "*")) if tool_name == "get_file_summary": return get_file_summary(tool_input.get("path", "")) + if tool_name == "web_search": + return web_search(tool_input.get("query", "")) + if tool_name == "fetch_url": + return fetch_url(tool_input.get("url", "")) return f"ERROR: unknown MCP tool '{tool_name}'" @@ -272,4 +526,32 @@ MCP_TOOL_SPECS = [ "required": ["path"], }, }, -] + { + "name": "web_search", + "description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query." + } + }, + "required": ["query"] + } + }, + { + "name": "fetch_url", + "description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.", + "parameters": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "The URL to fetch." + } + }, + "required": ["url"] + } + }, +] \ No newline at end of file