web search support.

2026-02-21 23:17:42 -05:00
parent d7d0583b4e
commit 813297c099
2 changed files with 290 additions and 7 deletions

View File

@@ -35,7 +35,7 @@ MAX_TOOL_ROUNDS = 10
_ANTHROPIC_CHUNK_SIZE = 180_000
_SYSTEM_PROMPT = (
"You are a helpful coding assistant with access to a PowerShell tool and MCP file tools (read_file, list_directory, search_files, get_file_summary). "
"You are a helpful coding assistant with access to a PowerShell tool and MCP tools (file access: read_file, list_directory, search_files, get_file_summary, web access: web_search, fetch_url). "
"When asked to create or edit files, prefer targeted edits over full rewrites. "
"Always explain what you are doing before invoking the tool.\n\n"
"When writing or rewriting large files (especially those containing quotes, backticks, or special characters), "
@@ -791,3 +791,4 @@ def send(
elif _provider == "anthropic":
return _send_anthropic(md_content, user_message, base_dir, file_items)
raise ValueError(f"unknown provider: {_provider}")

View File

@@ -17,6 +17,10 @@
from pathlib import Path
import summarize
import urllib.request
import urllib.parse
from html.parser import HTMLParser
import re as _re
# ------------------------------------------------------------------ state
@@ -117,8 +121,35 @@ def list_directory(path: str) -> str:
return f"ERROR: not a directory: {path}" return f"ERROR: not a directory: {path}"
try: try:
entries = sorted(p.iterdir(), key=lambda e: (e.is_file(), e.name.lower())) entries = sorted(p.iterdir(), key=lambda e: (e.is_file(), e.name.lower()))
lines = [f"Directory: {p}", ""] lines = [f"Directory: {p}", "" {
for entry in entries: "name": "web_search",
"description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query."
}
},
"required": ["query"]
}
},
{
"name": "fetch_url",
"description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to fetch."
}
},
"required": ["url"]
}
},
] for entry in entries:
kind = "file" if entry.is_file() else "dir " kind = "file" if entry.is_file() else "dir "
size = f"{entry.stat().st_size:>10,} bytes" if entry.is_file() else "" size = f"{entry.stat().st_size:>10,} bytes" if entry.is_file() else ""
lines.append(f" [{kind}] {entry.name:<40} {size}") lines.append(f" [{kind}] {entry.name:<40} {size}")
@@ -142,8 +173,35 @@ def search_files(path: str, pattern: str) -> str:
matches = sorted(p.glob(pattern))
if not matches:
return f"No files matched '{pattern}' in {path}"
lines = [f"Search '{pattern}' in {p}:", ""] lines = [f"Search '{pattern}' in {p}:", "" {
for m in matches: "name": "web_search",
"description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query."
}
},
"required": ["query"]
}
},
{
"name": "fetch_url",
"description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to fetch."
}
},
"required": ["url"]
}
},
] for m in matches:
rel = m.relative_to(p)
kind = "file" if m.is_file() else "dir "
lines.append(f" [{kind}] {rel}")
@@ -173,9 +231,201 @@ def get_file_summary(path: str) -> str:
return f"ERROR summarising '{path}': {e}" return f"ERROR summarising '{path}': {e}"
# ------------------------------------------------------------------ web tools
class _DDGParser(HTMLParser):
def __init__(self):
super().__init__()
self.results = []
self.in_result = False
self.in_title = False
self.in_snippet = False
self.current_link = ""
self.current_title = ""
self.current_snippet = ""
def handle_starttag(self, tag, attrs):
attrs = dict(attrs)
if tag == "a" and "result__url" in attrs.get("class", ""):
self.current_link = attrs.get("href", "")
if tag == "a" and "result__snippet" in attrs.get("class", ""):
self.in_snippet = True
if tag == "h2" and "result__title" in attrs.get("class", ""):
self.in_title = True
def handle_endtag(self, tag):
if tag == "a" and self.in_snippet:
self.in_snippet = False
if tag == "h2" and self.in_title:
self.in_title = False
if self.current_link:
self.results.append({
"title": self.current_title.strip(),
"link": self.current_link,
"snippet": self.current_snippet.strip()
})
self.current_title = ""
self.current_snippet = ""
self.current_link = ""
def handle_data(self, data):
if self.in_title:
self.current_title += data
if self.in_snippet:
self.current_snippet += data
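# Editor's illustrative sketch, not part of this commit: a minimal check of _DDGParser.
# The markup order below is arranged to suit the parser (url and snippet anchors appear
# before the closing </h2>); real DuckDuckGo result HTML may be ordered differently.
def _demo_ddg_parser():
    p = _DDGParser()
    p.feed(
        '<div class="result">'
        '<a class="result__url" href="https://example.com/docs">example.com/docs</a>'
        '<a class="result__snippet">An example snippet.</a>'
        '<h2 class="result__title">Example Domain Docs</h2>'
        '</div>'
    )
    # p.results -> [{'title': 'Example Domain Docs', 'link': 'https://example.com/docs',
    #                'snippet': 'An example snippet.'}]
    return p.results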
class _TextExtractor(HTMLParser):
def __init__(self):
super().__init__()
self.text = []
self.hide = 0
self.ignore_tags = {'script', 'style', 'head', 'meta', 'nav', 'header', 'footer', 'noscript', 'svg'}
def handle_starttag(self, tag, attrs):
if tag in self.ignore_tags:
self.hide += 1
def handle_endtag(self, tag):
if tag in self.ignore_tags:
self.hide -= 1
def handle_data(self, data):
if self.hide == 0:
cleaned = data.strip()
if cleaned:
self.text.append(cleaned)
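# Editor's illustrative sketch, not part of this commit: _TextExtractor drops everything
# inside the ignored tags (head, style, nav, ...) and keeps only the visible text.
def _demo_text_extractor():
    p = _TextExtractor()
    p.feed(
        "<html><head><title>x</title><style>p{}</style></head>"
        "<body><nav>menu</nav><p>Hello <b>world</b></p></body></html>"
    )
    return " ".join(p.text)  # -> "Hello world"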
def web_search(query: str) -> str:
"""Search the web using DuckDuckGo HTML and return top results."""
url = "https://html.duckduckgo.com/html/?q=" + urllib.parse.quote(query)
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
try:
html = urllib.request.urlopen(req, timeout=10).read().decode('utf-8', errors='ignore')
parser = _DDGParser()
parser.feed(html)
if not parser.results:
return f"No results found for '{query}'"
lines = [f"Search Results for '{query}':
" {
"name": "web_search",
"description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query."
}
},
"required": ["query"]
}
},
{
"name": "fetch_url",
"description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to fetch."
}
},
"required": ["url"]
}
},
] for i, r in enumerate(parser.results[:5], 1):
lines.append(f"{i}. {r['title']}
URL: {r['link']}
Snippet: {r['snippet']}
")
return "
".join(lines)
except Exception as e:
return f"ERROR searching web for '{query}': {e}"
def fetch_url(url: str) -> str:
"""Fetch a URL and return its text content (stripped of HTML tags)."""
# Correct duckduckgo redirect links if passed
if url.startswith("//duckduckgo.com/l/?uddg="):
url = urllib.parse.unquote(url.split("uddg=")[1].split("&")[0])
if not url.startswith("http"):
url = "https://" + url
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
try:
html = urllib.request.urlopen(req, timeout=10).read().decode('utf-8', errors='ignore')
parser = _TextExtractor()
parser.feed(html)
full_text = " ".join(parser.text)
full_text = _re.sub(r'\s+', ' ', full_text)
# Limit to 40k chars to prevent context blowup
if len(full_text) > 40000:
return full_text[:40000] + "\n... (content truncated)"
return full_text
except Exception as e:
return f"ERROR fetching URL '{url}': {e}"
# ------------------------------------------------------------------ tool dispatch
TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary"}
TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary", "web_search", "fetch_url"}
def dispatch(tool_name: str, tool_input: dict) -> str:
@@ -190,6 +440,10 @@ def dispatch(tool_name: str, tool_input: dict) -> str:
return search_files(tool_input.get("path", ""), tool_input.get("pattern", "*"))
if tool_name == "get_file_summary":
return get_file_summary(tool_input.get("path", ""))
if tool_name == "web_search":
return web_search(tool_input.get("query", ""))
if tool_name == "fetch_url":
return fetch_url(tool_input.get("url", ""))
return f"ERROR: unknown MCP tool '{tool_name}'" return f"ERROR: unknown MCP tool '{tool_name}'"
@@ -272,4 +526,32 @@ MCP_TOOL_SPECS = [
"required": ["path"], "required": ["path"],
}, },
}, },
{
"name": "web_search",
"description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query."
}
},
"required": ["query"]
}
},
{
"name": "fetch_url",
"description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to fetch."
}
},
"required": ["url"]
}
},
]
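# Editor's note, not part of this commit: the specs above describe their JSON schema under a
# "parameters" key. If the Anthropic path (_send_anthropic) forwards tools to the Messages API,
# which expects "input_schema" instead, an adapter along these lines would be needed
# (the helper name is illustrative, not taken from the codebase):
def _to_anthropic_tools(specs):
    return [
        {"name": s["name"], "description": s["description"], "input_schema": s["parameters"]}
        for s in specs
    ]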