web search support.
@@ -35,7 +35,7 @@ MAX_TOOL_ROUNDS = 10
 _ANTHROPIC_CHUNK_SIZE = 180_000

 _SYSTEM_PROMPT = (
-    "You are a helpful coding assistant with access to a PowerShell tool and MCP file tools (read_file, list_directory, search_files, get_file_summary). "
+    "You are a helpful coding assistant with access to a PowerShell tool and MCP tools (file access: read_file, list_directory, search_files, get_file_summary, web access: web_search, fetch_url). "
     "When asked to create or edit files, prefer targeted edits over full rewrites. "
     "Always explain what you are doing before invoking the tool.\n\n"
     "When writing or rewriting large files (especially those containing quotes, backticks, or special characters), "
@@ -791,3 +791,4 @@ def send(
     elif _provider == "anthropic":
         return _send_anthropic(md_content, user_message, base_dir, file_items)
     raise ValueError(f"unknown provider: {_provider}")
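The tail of send() above routes each request by provider. As a rough usage sketch (the full signature of send() is not visible in this hunk, so the argument list below is inferred from the _send_anthropic call and should be treated as an assumption, not the confirmed API):

    # Hypothetical caller; argument names are inferred from the
    # _send_anthropic(...) call in the hunk above.
    reply = send(
        md_content="",                        # conversation markdown so far
        user_message="Summarize mcp_client.py",
        base_dir=".",                         # root the MCP file tools may read
        file_items=[],                        # attached files, if any
    )
    print(reply)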
mcp_client.py  (294 changed lines)
@@ -17,6 +17,10 @@

 from pathlib import Path
 import summarize
+import urllib.request
+import urllib.parse
+from html.parser import HTMLParser
+import re as _re

 # ------------------------------------------------------------------ state

@@ -117,8 +121,35 @@ def list_directory(path: str) -> str:
         return f"ERROR: not a directory: {path}"
     try:
         entries = sorted(p.iterdir(), key=lambda e: (e.is_file(), e.name.lower()))
         lines = [f"Directory: {p}", ""]
         for entry in entries:
             kind = "file" if entry.is_file() else "dir "
             size = f"{entry.stat().st_size:>10,} bytes" if entry.is_file() else ""
             lines.append(f" [{kind}] {entry.name:<40} {size}")
@@ -142,8 +173,35 @@ def search_files(path: str, pattern: str) -> str:
     matches = sorted(p.glob(pattern))
     if not matches:
         return f"No files matched '{pattern}' in {path}"
     lines = [f"Search '{pattern}' in {p}:", ""]
     for m in matches:
         rel = m.relative_to(p)
         kind = "file" if m.is_file() else "dir "
         lines.append(f" [{kind}] {rel}")
@@ -173,9 +231,201 @@ def get_file_summary(path: str) -> str:
         return f"ERROR summarising '{path}': {e}"


+# ------------------------------------------------------------------ web tools
+
+class _DDGParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.results = []
+        self.in_result = False
+        self.in_title = False
+        self.in_snippet = False
+        self.current_link = ""
+        self.current_title = ""
+        self.current_snippet = ""
+
+    def handle_starttag(self, tag, attrs):
+        attrs = dict(attrs)
+        if tag == "a" and "result__url" in attrs.get("class", ""):
+            self.current_link = attrs.get("href", "")
+        if tag == "a" and "result__snippet" in attrs.get("class", ""):
+            self.in_snippet = True
+        if tag == "h2" and "result__title" in attrs.get("class", ""):
+            self.in_title = True
+
+    def handle_endtag(self, tag):
+        if tag == "a" and self.in_snippet:
+            self.in_snippet = False
+        if tag == "h2" and self.in_title:
+            self.in_title = False
+            if self.current_link:
+                self.results.append({
+                    "title": self.current_title.strip(),
+                    "link": self.current_link,
+                    "snippet": self.current_snippet.strip()
+                })
+            self.current_title = ""
+            self.current_snippet = ""
+            self.current_link = ""
+
+    def handle_data(self, data):
+        if self.in_title:
+            self.current_title += data
+        if self.in_snippet:
+            self.current_snippet += data
+
+
+class _TextExtractor(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.text = []
+        self.hide = 0
+        self.ignore_tags = {'script', 'style', 'head', 'meta', 'nav', 'header', 'footer', 'noscript', 'svg'}
+
+    def handle_starttag(self, tag, attrs):
+        if tag in self.ignore_tags:
+            self.hide += 1
+
+    def handle_endtag(self, tag):
+        if tag in self.ignore_tags:
+            self.hide -= 1
+
+    def handle_data(self, data):
+        if self.hide == 0:
+            cleaned = data.strip()
+            if cleaned:
+                self.text.append(cleaned)
+
+
+def web_search(query: str) -> str:
+    """Search the web using DuckDuckGo HTML and return top results."""
+    url = "https://html.duckduckgo.com/html/?q=" + urllib.parse.quote(query)
+    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
+    try:
+        html = urllib.request.urlopen(req, timeout=10).read().decode('utf-8', errors='ignore')
+        parser = _DDGParser()
+        parser.feed(html)
+        if not parser.results:
+            return f"No results found for '{query}'"
+        lines = [f"Search Results for '{query}':\n"]
+        for i, r in enumerate(parser.results[:5], 1):
+            lines.append(f"{i}. {r['title']}\n   URL: {r['link']}\n   Snippet: {r['snippet']}\n")
+        return "\n".join(lines)
+    except Exception as e:
+        return f"ERROR searching web for '{query}': {e}"
+
+
+def fetch_url(url: str) -> str:
+    """Fetch a URL and return its text content (stripped of HTML tags)."""
+    # Correct duckduckgo redirect links if passed
+    if url.startswith("//duckduckgo.com/l/?uddg="):
+        url = urllib.parse.unquote(url.split("uddg=")[1].split("&")[0])
+
+    if not url.startswith("http"):
+        url = "https://" + url
+
+    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
+    try:
+        html = urllib.request.urlopen(req, timeout=10).read().decode('utf-8', errors='ignore')
+        parser = _TextExtractor()
+        parser.feed(html)
+        full_text = " ".join(parser.text)
+        full_text = _re.sub(r'\s+', ' ', full_text)
+        # Limit to 40k chars to prevent context blowup
+        if len(full_text) > 40000:
+            return full_text[:40000] + "\n... (content truncated)"
+        return full_text
+    except Exception as e:
+        return f"ERROR fetching URL '{url}': {e}"
+
+
 # ------------------------------------------------------------------ tool dispatch

-TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary"}
+TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary", "web_search", "fetch_url"}


 def dispatch(tool_name: str, tool_input: dict) -> str:
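The two new helpers are meant to chain: web_search returns numbered results whose URLs feed fetch_url. A short sketch of that flow outside the chat loop (the query is illustrative, and parsing the formatted result string with a regex is an illustration only, since web_search returns human-readable text rather than structured data):

    import re
    import mcp_client

    # web_search() returns a readable block with "URL: ..." lines.
    results = mcp_client.web_search("python html.parser tutorial")
    print(results)

    # Pull the first URL back out and fetch its readable text.
    m = re.search(r"URL: (\S+)", results)
    if m:
        print(mcp_client.fetch_url(m.group(1))[:500])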
@@ -190,6 +440,10 @@ def dispatch(tool_name: str, tool_input: dict) -> str:
         return search_files(tool_input.get("path", ""), tool_input.get("pattern", "*"))
     if tool_name == "get_file_summary":
         return get_file_summary(tool_input.get("path", ""))
+    if tool_name == "web_search":
+        return web_search(tool_input.get("query", ""))
+    if tool_name == "fetch_url":
+        return fetch_url(tool_input.get("url", ""))
     return f"ERROR: unknown MCP tool '{tool_name}'"
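dispatch() stays the single entry point for every MCP tool call, so the chat loop picks up the new tools without changes. For instance:

    import mcp_client

    # Routed to web_search(); a missing "query" key falls back to "".
    print(mcp_client.dispatch("web_search", {"query": "python urllib timeout"}))

    # Routed to fetch_url().
    print(mcp_client.dispatch("fetch_url", {"url": "https://example.com"}))

    # Unknown tool names return an ERROR string instead of raising.
    print(mcp_client.dispatch("no_such_tool", {}))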
@@ -272,4 +526,32 @@ MCP_TOOL_SPECS = [
             "required": ["path"],
         },
     },
+    {
+        "name": "web_search",
+        "description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "query": {
+                    "type": "string",
+                    "description": "The search query."
+                }
+            },
+            "required": ["query"]
+        }
+    },
+    {
+        "name": "fetch_url",
+        "description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "url": {
+                    "type": "string",
+                    "description": "The URL to fetch."
+                }
+            },
+            "required": ["url"]
+        }
+    },
 ]
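Note that these specs carry the JSON Schema under an OpenAI-style "parameters" key, while Anthropic's Messages API expects the same schema under "input_schema", so the Anthropic path presumably remaps it before sending. A minimal adapter sketch (this helper is not part of the commit; it only illustrates the key rename):

    import mcp_client

    def to_anthropic_tools(specs):
        # Same name/description; the JSON Schema moves from "parameters"
        # to the "input_schema" key Anthropic's Messages API expects.
        return [
            {
                "name": s["name"],
                "description": s["description"],
                "input_schema": s["parameters"],
            }
            for s in specs
        ]

    tools = to_anthropic_tools(mcp_client.MCP_TOOL_SPECS)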