web search support.
mcp_client.py
@@ -17,6 +17,10 @@
from pathlib import Path
import summarize
import urllib.request
import urllib.parse
from html.parser import HTMLParser
import re as _re

# ------------------------------------------------------------------ state

@@ -117,8 +121,35 @@ def list_directory(path: str) -> str:
        return f"ERROR: not a directory: {path}"
    try:
        entries = sorted(p.iterdir(), key=lambda e: (e.is_file(), e.name.lower()))
        lines = [f"Directory: {p}", ""]
        for entry in entries:
            kind = "file" if entry.is_file() else "dir "
            size = f"{entry.stat().st_size:>10,} bytes" if entry.is_file() else ""
            lines.append(f"  [{kind}] {entry.name:<40} {size}")
@@ -142,8 +173,35 @@ def search_files(path: str, pattern: str) -> str:
    matches = sorted(p.glob(pattern))
    if not matches:
        return f"No files matched '{pattern}' in {path}"
    lines = [f"Search '{pattern}' in {p}:", ""]
    for m in matches:
        rel = m.relative_to(p)
        kind = "file" if m.is_file() else "dir "
        lines.append(f"  [{kind}] {rel}")
@@ -173,9 +231,201 @@ def get_file_summary(path: str) -> str:
        return f"ERROR summarising '{path}': {e}"


# ------------------------------------------------------------------ web tools

class _DDGParser(HTMLParser):
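    # Parses DuckDuckGo's static HTML results page (html.duckduckgo.com) by
    # keying off the result__title / result__url / result__snippet CSS classes
    # matched in handle_starttag below. These class names are DuckDuckGo markup
    # details and may change without notice, in which case no results are parsed.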
    def __init__(self):
        super().__init__()
        self.results = []
        self.in_result = False
        self.in_title = False
        self.in_snippet = False
        self.current_link = ""
        self.current_title = ""
        self.current_snippet = ""

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == "a" and "result__url" in attrs.get("class", ""):
            self.current_link = attrs.get("href", "")
        if tag == "a" and "result__snippet" in attrs.get("class", ""):
            self.in_snippet = True
        if tag == "h2" and "result__title" in attrs.get("class", ""):
            self.in_title = True

    def handle_endtag(self, tag):
        if tag == "a" and self.in_snippet:
            self.in_snippet = False
        if tag == "h2" and self.in_title:
            self.in_title = False
        if self.current_link:
            self.results.append({
                "title": self.current_title.strip(),
                "link": self.current_link,
                "snippet": self.current_snippet.strip()
            })
            self.current_title = ""
            self.current_snippet = ""
            self.current_link = ""

    def handle_data(self, data):
        if self.in_title:
            self.current_title += data
        if self.in_snippet:
            self.current_snippet += data


class _TextExtractor(HTMLParser):
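    # Collects visible text only: `hide` counts how deep we are inside ignored
    # containers (script/style/nav/...), so text inside nested ignored tags is
    # skipped until every matching end tag has been seen.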
    def __init__(self):
        super().__init__()
        self.text = []
        self.hide = 0
        self.ignore_tags = {'script', 'style', 'head', 'meta', 'nav', 'header', 'footer', 'noscript', 'svg'}

    def handle_starttag(self, tag, attrs):
        if tag in self.ignore_tags:
            self.hide += 1

    def handle_endtag(self, tag):
        if tag in self.ignore_tags:
            self.hide -= 1

    def handle_data(self, data):
        if self.hide == 0:
            cleaned = data.strip()
            if cleaned:
                self.text.append(cleaned)


def web_search(query: str) -> str:
    """Search the web using DuckDuckGo HTML and return top results."""
    url = "https://html.duckduckgo.com/html/?q=" + urllib.parse.quote(query)
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    try:
        html = urllib.request.urlopen(req, timeout=10).read().decode('utf-8', errors='ignore')
        parser = _DDGParser()
        parser.feed(html)
        if not parser.results:
            return f"No results found for '{query}'"
        lines = [f"Search Results for '{query}':\n"]
        for i, r in enumerate(parser.results[:5], 1):
            lines.append(f"{i}. {r['title']}\n   URL: {r['link']}\n   Snippet: {r['snippet']}\n")
        return "\n".join(lines)
    except Exception as e:
        return f"ERROR searching web for '{query}': {e}"


def fetch_url(url: str) -> str:
    """Fetch a URL and return its text content (stripped of HTML tags)."""
    # Correct DuckDuckGo redirect links if passed
    if url.startswith("//duckduckgo.com/l/?uddg="):
        url = urllib.parse.unquote(url.split("uddg=")[1].split("&")[0])

    if not url.startswith("http"):
        url = "https://" + url

    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    try:
        html = urllib.request.urlopen(req, timeout=10).read().decode('utf-8', errors='ignore')
        parser = _TextExtractor()
        parser.feed(html)
        full_text = " ".join(parser.text)
        full_text = _re.sub(r'\s+', ' ', full_text)
        # Limit to 40k chars to prevent context blowup
        if len(full_text) > 40000:
            return full_text[:40000] + "\n... (content truncated)"
        return full_text
    except Exception as e:
        return f"ERROR fetching URL '{url}': {e}"


# ------------------------------------------------------------------ tool dispatch

-TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary"}
+TOOL_NAMES = {"read_file", "list_directory", "search_files", "get_file_summary", "web_search", "fetch_url"}


def dispatch(tool_name: str, tool_input: dict) -> str:
@@ -190,6 +440,10 @@ def dispatch(tool_name: str, tool_input: dict) -> str:
        return search_files(tool_input.get("path", ""), tool_input.get("pattern", "*"))
    if tool_name == "get_file_summary":
        return get_file_summary(tool_input.get("path", ""))
    if tool_name == "web_search":
        return web_search(tool_input.get("query", ""))
    if tool_name == "fetch_url":
        return fetch_url(tool_input.get("url", ""))
    return f"ERROR: unknown MCP tool '{tool_name}'"

@@ -272,4 +526,32 @@ MCP_TOOL_SPECS = [
            "required": ["path"],
        },
    },
    {
        "name": "web_search",
        "description": "Search the web using DuckDuckGo. Returns the top 5 search results with titles, URLs, and snippets. Chain this with fetch_url to read specific pages.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The search query."
                }
            },
            "required": ["query"]
        }
    },
    {
        "name": "fetch_url",
        "description": "Fetch a webpage and extract its text content, removing HTML tags and scripts. Useful for reading documentation or articles found via web_search.",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL to fetch."
                }
            },
            "required": ["url"]
        }
    },
]
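
A quick end-to-end check of the new tools (a hypothetical snippet, not part of this commit; it assumes mcp_client.py is importable from the working directory):

    import mcp_client

    # Search, then read one of the pages it found.
    hits = mcp_client.dispatch("web_search", {"query": "html.parser handle_starttag example"})
    print(hits)   # numbered titles, URLs, and snippets for up to five results

    # fetch_url also accepts bare domains; it prepends https:// itself.
    page = mcp_client.dispatch("fetch_url", {"url": "docs.python.org/3/library/html.parser.html"})
    print(page[:300])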