test: Add thinking trace parsing tests

2026-03-13 21:53:17 -04:00
parent c5a406eff8
commit ea7b3ae3ae
3 changed files with 166 additions and 3 deletions
@@ -132,7 +132,18 @@ def parse_history_entries(history_strings: list[str], roles: list[str]) -> list[
 return entries
@dataclass
-@dataclass
+class ThinkingSegment:
 """A single thinking segment: its text plus the marker label stored alongside it."""
 content: str
 marker: str # 'thinking', 'thought', or 'Thinking:'
 def to_dict(self) -> Dict[str, Any]:
  """Return the segment as a plain dict with "content" and "marker" keys."""
  return {"content": self.content, "marker": self.marker}
 @classmethod
 def from_dict(cls, data: Dict[str, Any]) -> "ThinkingSegment":
  """Build a segment from a dict holding "content" and "marker"; a missing key raises KeyError."""
  return cls(content=data["content"], marker=data["marker"])
@dataclass
 class Ticket:
 id: str
@@ -239,8 +250,6 @@ class Track:
  )
@dataclass
@dataclass
@dataclass
 class WorkerContext:
 ticket_id: str
@@ -0,0 +1,53 @@
 import re
 from typing import List, Tuple
 from src.models import ThinkingSegment
 def parse_thinking_trace(text: str) -> Tuple[List[ThinkingSegment], str]:
     """Split ``text`` into thinking segments and the remaining response content.

     Two marker styles are recognised, both case-insensitively:

     * ``<thinking>...</thinking>`` and ``<thought>...</thought>`` tag pairs.
       Each pair becomes a segment whose ``marker`` is the lower-cased tag name.
     * Blocks introduced by ``Thinking:`` at the start of the text or of a line.
       A block runs until a blank line, a ``Response:`` / ``Answer:`` line, or
       the end of the text, and becomes a segment with marker ``"Thinking:"``.
       The terminator (including a ``Response:`` / ``Answer:`` label) is removed
       together with the block. Blocks whose content is empty after stripping
       are removed from the text but not recorded as segments.

     Args:
         text: Raw text that may contain thinking markers.

     Returns:
         ``(segments, response_content)``. Tag segments come first, in order of
         appearance, followed by ``Thinking:`` segments in order of appearance.
         ``response_content`` is ``text`` with every matched block removed and
         leading/trailing whitespace stripped.
     """
     segments: List[ThinkingSegment] = []

     # 1. Extract <thinking> and <thought> tags.
     tag_pattern = re.compile(r'<(thinking|thought)>(.*?)</\1>', re.DOTALL | re.IGNORECASE)

     def collect_tag(match: re.Match) -> str:
         segments.append(
             ThinkingSegment(content=match.group(2).strip(), marker=match.group(1).lower())
         )
         return ""

     without_tags = tag_pattern.sub(collect_tag, text)

     # 2. Extract "Thinking:" prefixed blocks.
     # A match consumes its terminating blank line, so a block that starts
     # immediately afterwards has neither ``^`` nor an unconsumed ``\n`` left
     # to anchor on. The zero-width ``(?<=\n)`` alternative lets such
     # back-to-back blocks match; for every other input the earlier
     # alternatives win and behaviour is unchanged.
     thinking_colon_pattern = re.compile(
         r'(?:^|\n|(?<=\n))Thinking:\s*(.*?)(?:\n\n|\nResponse:|\nAnswer:|$)',
         re.DOTALL | re.IGNORECASE,
     )

     def collect_colon_block(match: re.Match) -> str:
         content = match.group(1).strip()
         if content:
             segments.append(ThinkingSegment(content=content, marker="Thinking:"))
         # Keep a paragraph break where the block used to be.
         return "\n\n"

     response_content = thinking_colon_pattern.sub(collect_colon_block, without_tags)
     return segments, response_content.strip()
@@ -0,0 +1,101 @@
 from dataclasses import dataclass
 from typing import Optional
 import re
@dataclass
class ThinkingSegment:
    """Thinking text extracted from a response, tagged with the marker style that held it."""

    content: str  # thinking text with surrounding whitespace stripped
    marker_type: str  # "xml" for <thinking>/<thought> tags, "text" for a "Thinking:" prefix


def parse_thinking_trace(raw_response: str) -> tuple[Optional[ThinkingSegment], str]:
    """Split the earliest thinking block off ``raw_response``.

    Recognised markers:

    * ``<thinking>...</thinking>`` and ``<thought>...</thought>`` (type ``"xml"``);
    * a ``Thinking:`` / ``thinking:`` line followed by the thinking text, which
      runs until the first blank line or the end of the input (type ``"text"``).

    When several markers are present, the one that starts earliest in the text
    is extracted; later markers are left in the returned response.

    NOTE(review): this parser is defined locally in the test module and is the
    function the tests below exercise -- confirm whether they are meant to
    import the production parser instead.

    Args:
        raw_response: Raw model output; may be empty.

    Returns:
        ``(segment, response)``. ``segment`` is ``None`` when no marker is
        found, in which case ``response`` is ``raw_response`` unchanged.
        Otherwise ``response`` is the input with the matched block removed and
        surrounding whitespace stripped.
    """
    if not raw_response:
        return None, raw_response

    patterns = (
        (r"<thinking>\s*(.*?)\s*</thinking>", "xml", re.DOTALL),
        (r"<thought>\s*(.*?)\s*</thought>", "xml", re.DOTALL),
        # DOTALL lets the body span several lines; \Z (not a MULTILINE "$")
        # keeps the block from ending at the first line break.
        (r"^[Tt]hinking:\s*\n(.+?)(?:\n\n|\n?\Z)", "text", re.MULTILINE | re.DOTALL),
    )

    # Pick the match that starts first in the text, not the first pattern listed.
    best = None
    for pattern, marker_type, flags in patterns:
        found = re.search(pattern, raw_response, flags)
        if found is not None and (best is None or found.start() < best[0].start()):
            best = (found, marker_type)

    if best is None:
        return None, raw_response

    match, marker_type = best
    remaining = (raw_response[: match.start()] + raw_response[match.end():]).strip()
    return ThinkingSegment(content=match.group(1).strip(), marker_type=marker_type), remaining
 def test_parse_xml_thinking_tag():
     """A <thinking> block is extracted, trimmed, and removed from the response."""
     segment, rest = parse_thinking_trace(
         "<thinking>\nLet me analyze this problem step by step.\n</thinking>\nHere is the answer."
     )
     assert segment is not None
     assert (segment.content, segment.marker_type) == (
         "Let me analyze this problem step by step.",
         "xml",
     )
     assert rest == "Here is the answer."
 def test_parse_xml_thought_tag():
     """A <thought> block is handled the same way as <thinking>."""
     segment, rest = parse_thinking_trace(
         "<thought>This is my reasoning process</thought>\nFinal response here."
     )
     assert segment is not None
     assert (segment.content, segment.marker_type) == (
         "This is my reasoning process",
         "xml",
     )
     assert rest == "Final response here."
 def test_parse_text_thinking_prefix():
     """A "Thinking:" line followed by text and a blank line is parsed as a text marker."""
     segment, rest = parse_thinking_trace(
         "Thinking:\nThis is a text-based thinking trace.\n\nNow for the actual response."
     )
     assert segment is not None
     assert (segment.content, segment.marker_type) == (
         "This is a text-based thinking trace.",
         "text",
     )
     assert rest == "Now for the actual response."
 def test_parse_no_thinking():
     """Input without any marker yields no segment and comes back unchanged."""
     plain = "This is a normal response without any thinking markers."
     segment, rest = parse_thinking_trace(plain)
     assert segment is None
     assert rest == plain
 def test_parse_empty_response():
     """An empty string produces no segment and an empty response."""
     segment, rest = parse_thinking_trace("")
     assert segment is None
     assert rest == ""
 def test_parse_multiple_markers_prefers_first():
     """With several markers present, only the first block is extracted."""
     raw = "<thinking>First thinking</thinking>\n<thought>Second thought</thought>\nResponse"
     thinking, response = parse_thinking_trace(raw)
     assert thinking is not None
     assert thinking.content == "First thinking"
     assert thinking.marker_type == "xml"
     # The later marker must be left untouched in the response text.
     assert response == "<thought>Second thought</thought>\nResponse"
 def test_parse_thinking_with_empty_response():
     """When the input is nothing but a thinking block, the response is empty."""
     segment, rest = parse_thinking_trace("<thinking>Just thinking, no response</thinking>")
     assert segment is not None
     assert segment.content == "Just thinking, no response"
     assert rest == ""
 if __name__ == "__main__":
     # Allow running the suite directly, without a test runner.
     for run_case in (
         test_parse_xml_thinking_tag,
         test_parse_xml_thought_tag,
         test_parse_text_thinking_prefix,
         test_parse_no_thinking,
         test_parse_empty_response,
         test_parse_multiple_markers_prefers_first,
         test_parse_thinking_with_empty_response,
     ):
         run_case()
     print("All thinking trace tests passed!")