Add web search support.

This commit is contained in:
2026-02-21 23:17:42 -05:00
parent d7d0583b4e
commit 813297c099
2 changed files with 290 additions and 7 deletions

View File

@@ -17,6 +17,10 @@
from pathlib import Path
import summarize
import urllib.request
import urllib.parse
from html.parser import HTMLParser
import re as _re
# ------------------------------------------------------------------ state
@@ -117,8 +121,35 @@ def list_directory(path: str) -> str:
return f"ERROR: not a directory: {path}"
try:
entries = sorted(p.iterdir(), key=lambda e: (e.is_file(), e.name.lower()))
lines = [f"Directory: {p}", ""]
for entry in entries:
kind = "file" if entry.is_file() else "dir "
size = f"{entry.stat().st_size:>10,} bytes" if entry.is_file() else ""
lines.append(f" [{kind}] {entry.name:<40} {size}")
@@ -142,8 +173,35 @@ def search_files(path: str, pattern: str) -> str:
matches = sorted(p.glob(pattern))
if not matches:
return f"No files matched '{pattern}' in {path}"
lines = [f"Search '{pattern}' in {p}:", ""]
for m in matches:
rel = m.relative_to(p)
kind = "file" if m.is_file() else "dir "
lines.append(f" [{kind}] {rel}")
@@ -173,9 +231,201 @@ def get_file_summary(path: str) -> str:
return f"ERROR summarising '{path}': {e}"
# ------------------------------------------------------------------ web tools
class _DDGParser(HTMLParser):
def __init__(self):
super().__init__()
self.results = [ {
"name": "web_search",
"description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query."
}
},
"required": ["query"]
}
},
{
"name": "fetch_url",
"description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to fetch."
}
},
"required": ["url"]
}
},
] self.in_result = False
self.in_title = False
self.in_snippet = False
self.current_link = ""
self.current_title = ""
self.current_snippet = ""
def handle_starttag(self, tag, attrs):
attrs = dict(attrs)
if tag == "a" and "result__url" in attrs.get("class", ""):
self.current_link = attrs.get("href", "")
if tag == "a" and "result__snippet" in attrs.get("class", ""):
self.in_snippet = True
if tag == "h2" and "result__title" in attrs.get("class", ""):
self.in_title = True
def handle_endtag(self, tag):
if tag == "a" and self.in_snippet:
self.in_snippet = False
if tag == "h2" and self.in_title:
self.in_title = False
if self.current_link:
self.results.append({
"title": self.current_title.strip(),
"link": self.current_link,
"snippet": self.current_snippet.strip()
})
self.current_title = ""
self.current_snippet = ""
self.current_link = ""
def handle_data(self, data):
if self.in_title:
self.current_title += data
if self.in_snippet:
self.current_snippet += data
class _TextExtractor(HTMLParser):
def __init__(self):
super().__init__()
self.text = [ {
"name": "web_search",
"description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query."
}
},
"required": ["query"]
}
},
{
"name": "fetch_url",
"description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to fetch."
}
},
"required": ["url"]
}
},
] self.hide = 0
self.ignore_tags = {'script', 'style', 'head', 'meta', 'nav', 'header', 'footer', 'noscript', 'svg'}
def handle_starttag(self, tag, attrs):
if tag in self.ignore_tags:
self.hide += 1
def handle_endtag(self, tag):
if tag in self.ignore_tags:
self.hide -= 1
def handle_data(self, data):
if self.hide == 0:
cleaned = data.strip()
if cleaned:
self.text.append(cleaned)
def web_search(query: str) -> str:
    """Search the web via DuckDuckGo's HTML endpoint and return the top 5 hits.

    Returns a human-readable, numbered list of title / URL / snippet,
    or an ``ERROR ...`` / ``No results ...`` string on failure — this
    function never raises, so tool dispatch always gets a string back.
    """
    url = "https://html.duckduckgo.com/html/?q=" + urllib.parse.quote(query)
    # DDG serves a captcha/empty page to unknown agents; present a browser UA.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    try:
        # Context manager closes the connection (the original leaked it).
        with urllib.request.urlopen(req, timeout=10) as resp:
            html = resp.read().decode('utf-8', errors='ignore')
        parser = _DDGParser()
        parser.feed(html)
        if not parser.results:
            return f"No results found for '{query}'"
        lines = [f"Search Results for '{query}':\n"]
        for i, r in enumerate(parser.results[:5], 1):
            lines.append(f"{i}. {r['title']}\n   URL: {r['link']}\n   Snippet: {r['snippet']}\n")
        return "\n".join(lines)
    except Exception as e:
        return f"ERROR searching web for '{query}': {e}"
def fetch_url(url: str) -> str:
    """Fetch *url* and return its visible text content (HTML tags stripped).

    DuckDuckGo redirect links (``//duckduckgo.com/l/?uddg=<target>``) are
    unwrapped to their real target first, and ``https://`` is prepended when
    no scheme is present.  Whitespace is collapsed and output is capped at
    40k characters.  Failures are reported as an ``ERROR ...`` string rather
    than raised, so tool dispatch always gets a string back.
    """
    # Correct duckduckgo redirect links if passed
    if url.startswith("//duckduckgo.com/l/?uddg="):
        url = urllib.parse.unquote(url.split("uddg=")[1].split("&")[0])
    if not url.startswith("http"):
        url = "https://" + url
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    try:
        # Context manager closes the connection (the original leaked it).
        with urllib.request.urlopen(req, timeout=10) as resp:
            html = resp.read().decode('utf-8', errors='ignore')
        parser = _TextExtractor()
        parser.feed(html)
        full_text = _re.sub(r'\s+', ' ', " ".join(parser.text))
        # Limit to 40k chars to prevent context blowup
        if len(full_text) > 40000:
            return full_text[:40000] + "\n... (content truncated)"
        return full_text
    except Exception as e:
        return f"ERROR fetching URL '{url}': {e}"
# ------------------------------------------------------------------ tool dispatch
TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary"}
TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary", "web_search", "fetch_url"}
def dispatch(tool_name: str, tool_input: dict) -> str:
@@ -190,6 +440,10 @@ def dispatch(tool_name: str, tool_input: dict) -> str:
return search_files(tool_input.get("path", ""), tool_input.get("pattern", "*"))
if tool_name == "get_file_summary":
return get_file_summary(tool_input.get("path", ""))
if tool_name == "web_search":
return web_search(tool_input.get("query", ""))
if tool_name == "fetch_url":
return fetch_url(tool_input.get("url", ""))
return f"ERROR: unknown MCP tool '{tool_name}'"
@@ -272,4 +526,32 @@ MCP_TOOL_SPECS = [
"required": ["path"],
},
},
]
{
"name": "web_search",
"description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query."
}
},
"required": ["query"]
}
},
{
"name": "fetch_url",
"description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to fetch."
}
},
"required": ["url"]
}
},
]