doc_weaver

A Markdown document hydration and transformation toolkit.

Utilities for parsing structured Markdown documents and interacting with them programmatically, hydrating templates with AI-generated content, and applying text transformations.

View Source

"""A Markdown document hydration and transformation toolkit.

Utilities for parsing structured Markdown documents and interacting with them
programmatically, hydrating templates with AI-generated content, and applying
text transformations.
"""

from doc_weaver.document import Document, SubSection, Content
from doc_weaver.parser import load_markdown, ValidationError
from doc_weaver.hydrate_queue import hydrate, HydrateQueue
from doc_weaver.hydrate_batch import hydrate_item
from doc_weaver.text_morpher import simple_morph

# Names re-exported as the package's public API.
__all__ = [
    "Document",
    "SubSection",
    "Content",
    "load_markdown",
    "ValidationError",
    "hydrate",
    "HydrateQueue",
    "hydrate_item",
    "simple_morph",
]

class Document: View Source

class Document:
    """A top-level document with header, tagline, and hierarchical sections.

    Document represents a complete markdown document with a title (H1), tagline
    (blockquote), and a collection of sections. `sections` is a dictionary
    mapping section titles (rendered as H2) to lists of `SubSection` instances.

    The model provides convenience methods for creating sections, subsections,
    and content items, as well as a `preview` method that renders the entire
    document as markdown.

    Attributes:
        id: A unique identifier for this document. Auto-generated as a UUID
            string if not provided.
        sections: A dictionary mapping section titles to lists of `SubSection`
            instances.
        header: The document header (rendered as an H1 heading in markdown).
        tagline: The document tagline (rendered as a blockquote in markdown).

    Example:
        ```python
        doc = Document(header="Q1 Report", tagline="Summary of achievements")
        doc.create_section("Results")
        doc.create_subsection("Results", ["Revenue", "Costs"])

        # Get the first subsection's ID to add content
        revenue_id = doc.sections["Results"][0].id
        doc.create_content("Results", revenue_id, "Up 20%")

        markdown = doc.preview()
        ```
    """
    def __init__(self, header: str, tagline: str, sections = None, id=None) -> None:
        """Initialize a new Document.

        Args:
            header: The document header (H1 title).
            tagline: The document tagline (blockquote).
            sections: Optional initial sections dictionary. The object is
                stored as-is (not copied). Defaults to a new empty dictionary.
            id: Optional identifier. Any falsy value (`None`, `""`, ...) is
                replaced by a newly generated UUID string.
        """
        self.id = id if id else str(uuid4())
        self.sections = sections if sections is not None else {}
        self.header = header
        self.tagline = tagline

    def create_section(self, title: str) -> None:
        """Create a new, empty section in the document.

        If a section with the given title already exists, it is overwritten
        with an empty list and its subsections are discarded.

        Args:
            title: The section title (will be rendered as an H2 heading).
        """
        self.sections[title] = []

    def create_subsection(self, section_title: str, subsection_titles: Union[str, List[str]]) -> None:
        """Create one or more subsections within a section.

        If the section does not exist, it is created automatically. Each
        subsection is initialized with an empty list of content items and is
        appended after any subsections already present.

        Args:
            section_title: The parent section title.
            subsection_titles: A single subsection title string or a list of
                subsection title strings.
        """
        # A bare string is one title, not an iterable of one-character titles.
        if isinstance(subsection_titles, str):
            subsection_titles = [subsection_titles]
        subsections = self.sections.setdefault(section_title, [])
        subsections.extend(
            SubSection(title=title, items=[]) for title in subsection_titles
        )

    def create_content(self, section_title: str, subsection_id: str, text: str) -> None:
        """Create a content item within a specific subsection.

        Locates the subsection by ID within the specified section and adds
        a new `Content` instance with the provided text. If the section does
        not exist, it is created (empty). If the subsection ID is not found,
        the method returns without adding content.

        Args:
            section_title: The parent section title.
            subsection_id: The unique identifier of the target subsection.
            text: The text content to add.
        """
        for subsection in self.sections.setdefault(section_title, []):
            if subsection.id == subsection_id:
                subsection.add_content(Content(text))
                return

    def preview(self) -> str:
        """Render the entire document as markdown.

        Returns:
            A markdown string with the H1 header, the blockquote tagline, and
            one H2 line per section followed by `str()` of each of its
            subsections, each terminated by a newline.
        """
        # Collect fragments and join once instead of growing a string with +=.
        parts = [f"# {self.header}\n\n> {self.tagline}\n\n"]
        for title, subsections in self.sections.items():
            parts.append(f"## {title}\n")
            parts.extend(str(subsection) + "\n" for subsection in subsections)
        return "".join(parts)

A top-level document with header, tagline, and hierarchical sections.

Document represents a complete markdown document with a title (H1), tagline (blockquote), and a collection of sections. Each section is a dictionary mapping section titles (rendered as H2) to lists of SubSection instances.

The model provides convenience methods for creating sections, subsections, and content items, as well as a preview method that renders the entire document as markdown.

Attributes: id: A unique identifier for this document. Auto-generated as a UUID string if not provided. sections: A dictionary mapping section titles to lists of SubSection instances. header: The document header (rendered as an H1 heading in markdown). tagline: The document tagline (rendered as a blockquote in markdown).

Example:


doc = Document(header="Q1 Report", tagline="Summary of achievements")
doc.create_section("Results")
doc.create_subsection("Results", ["Revenue", "Costs"])

# Get the first subsection's ID to add content
revenue_id = doc.sections["Results"][0].id
doc.create_content("Results", revenue_id, "Up 20%")

markdown = doc.preview()

Document(header: str, tagline: str, sections=None, id=None) View Source

161    def __init__(self, header: str, tagline: str, sections = None, id=None) -> None:
162        """Initialize a new Document.
163
164        Args:
165            header: The document header (H1 title).
166            tagline: The document tagline (blockquote).
167            sections: Optional initial sections dictionary. Defaults to an
168                empty dictionary.
169            id: Optional identifier. If not provided, a new UUID is generated.
170        """
171        self.id = id if id else str(uuid4())
172        self.sections = sections if sections is not None else {}
173        self.header = header
174        self.tagline = tagline

Initialize a new Document.

Args: header: The document header (H1 title). tagline: The document tagline (blockquote). sections: Optional initial sections dictionary. Defaults to an empty dictionary. id: Optional identifier. If not provided, a new UUID is generated.

sections

tagline

def create_section(self, title: str) -> None: View Source

176    def create_section(self, title: str) -> None:
177        """Create a new section in the document.
178
179        If a section with the given title already exists, it will be
180        overwritten with an empty list.
181
182        Args:
183            title: The section title (will be rendered as an H2 heading).
184        """
185        self.sections[title] = []

Create a new section in the document.

If a section with the given title already exists, it will be overwritten with an empty list.

Args: title: The section title (will be rendered as an H2 heading).

def create_subsection( self, section_title: str, subsection_titles: Union[str, List[str]]) -> None: View Source

187    def create_subsection(self, section_title: str, subsection_titles: Union[str, List[str]]) -> None:
188        """Create one or more subsections within a section.
189
190        If the section does not exist, it will be created automatically.
191        Each subsection is initialized with an empty list of content items.
192
193        Args:
194            section_title: The parent section title.
195            subsection_titles: A single subsection title string or a list of
196                subsection title strings.
197        """
198        if section_title not in self.sections:
199            self.sections[section_title] = []
200        if isinstance(subsection_titles, str):
201            subsection_titles = [subsection_titles]
202        for title in subsection_titles:
203            new_subsection = SubSection(title=title, items=[])
204            self.sections[section_title].append(new_subsection)

Create one or more subsections within a section.

If the section does not exist, it will be created automatically. Each subsection is initialized with an empty list of content items.

Args: section_title: The parent section title. subsection_titles: A single subsection title string or a list of subsection title strings.

def create_content(self, section_title: str, subsection_id: str, text: str) -> None: View Source

206    def create_content(self, section_title: str, subsection_id: str, text: str) -> None:
207        """Create a content item within a specific subsection.
208
209        Locates the subsection by ID within the specified section and adds
210        a new `Content` instance with the provided text. If the section does
211        not exist, it will be created. If the subsection ID is not found,
212        the method returns without adding content.
213
214        Args:
215            section_title: The parent section title.
216            subsection_id: The unique identifier of the target subsection.
217            text: The text content to add.
218        """
219        if section_title not in self.sections:
220            self.sections[section_title] = []
221        for subsection in self.sections[section_title]:
222            if subsection.id == subsection_id:
223                subsection.add_content(Content(text))
224                return

Create a content item within a specific subsection.

Locates the subsection by ID within the specified section and adds a new Content instance with the provided text. If the section does not exist, it will be created. If the subsection ID is not found, the method returns without adding content.

Args: section_title: The parent section title. subsection_id: The unique identifier of the target subsection. text: The text content to add.

def preview(self) -> str: View Source

226    def preview(self) -> str:
227        """Render the entire document as markdown.
228
229        Returns:
230            A markdown string with H1 header, blockquote tagline, H2 sections,
231            and H3 subsections with bulleted content items.
232        """
233        preview = f"# {self.header}\n\n> {self.tagline}\n\n"
234        for key, section in self.sections.items():
235            preview += f"## {key}\n"
236            for sec in section:
237                preview += str(sec) + "\n"
238
239        return preview

Render the entire document as markdown.

Returns: A markdown string with H1 header, blockquote tagline, H2 sections, and H3 subsections with bulleted content items.

class SubSection: View Source

 62class SubSection():
 63    """A titled subsection containing a list of content items.
 64
 65    SubSection represents a markdown H3 heading followed by a bulleted list
 66    of `Content` items. Each subsection is assigned a unique identifier for
 67    referencing within the parent section.
 68
 69    Attributes:
 70        title: The subsection title (rendered as an H3 heading in markdown).
 71        items: A list of `Content` instances contained in this subsection.
 72        id: A unique identifier for this subsection. Auto-generated as a
 73            UUID string if not provided.
 74
 75    Example:
 76        ```python
 77        subsection = SubSection(title="Key Findings")
 78        subsection.add_content(Content("Finding 1"))
 79        subsection.add_content(Content("Finding 2"))
 80
 81        print(str(subsection))
 82        # ### Key Findings
 83        # - Finding 1
 84        # - Finding 2
 85        ```
 86    """
 87    title: str
 88    items: List[Content]
 89    id: str
 90
 91    def __init__(self, title: str, items: List[Content] = None, id=None) -> None:
 92        """Initialize a new SubSection.
 93
 94        Args:
 95            title: The subsection title.
 96            items: Optional initial list of `Content` instances. Defaults to
 97                an empty list.
 98            id: Optional identifier. If not provided, a new UUID is generated.
 99        """
100        self.id = id if id else str(uuid4())
101        self.title = title
102        self.items = items if items is not None else []
103
104    def add_content(self, item: Content, index=None) -> None:
105        """Add a content item to this subsection.
106
107        Args:
108            item: The `Content` instance to add.
109            index: Optional position to insert the item. If not provided,
110                the item is appended to the end of the list.
111        """
112        if index is not None:
113            self.items.insert(index, item)
114        else:
115            self.items.append(item)
116
117    def __str__(self) -> str:
118        """Render the subsection as markdown.
119
120        Returns:
121            A markdown string with an H3 title followed by bulleted list items.
122        """
123        preview = f"### {self.title}\n"
124        for item in self.items:
125            preview += f"- {str(item)}\n"
126        return preview

A titled subsection containing a list of content items.

SubSection represents a markdown H3 heading followed by a bulleted list of Content items. Each subsection is assigned a unique identifier for referencing within the parent section.

Attributes: title: The subsection title (rendered as an H3 heading in markdown). items: A list of Content instances contained in this subsection. id: A unique identifier for this subsection. Auto-generated as a UUID string if not provided.

Example:


subsection = SubSection(title="Key Findings")
subsection.add_content(Content("Finding 1"))
subsection.add_content(Content("Finding 2"))

print(str(subsection))
# ### Key Findings
# - Finding 1
# - Finding 2

SubSection(title: str, items: List[Content] = None, id=None) View Source

 91    def __init__(self, title: str, items: List[Content] = None, id=None) -> None:
 92        """Initialize a new SubSection.
 93
 94        Args:
 95            title: The subsection title.
 96            items: Optional initial list of `Content` instances. Defaults to
 97                an empty list.
 98            id: Optional identifier. If not provided, a new UUID is generated.
 99        """
100        self.id = id if id else str(uuid4())
101        self.title = title
102        self.items = items if items is not None else []

Initialize a new SubSection.

Args: title: The subsection title. items: Optional initial list of Content instances. Defaults to an empty list. id: Optional identifier. If not provided, a new UUID is generated.

title: str

items: List[Content]

id: str

def add_content(self, item: Content, index=None) -> None: View Source

104    def add_content(self, item: Content, index=None) -> None:
105        """Add a content item to this subsection.
106
107        Args:
108            item: The `Content` instance to add.
109            index: Optional position to insert the item. If not provided,
110                the item is appended to the end of the list.
111        """
112        if index is not None:
113            self.items.insert(index, item)
114        else:
115            self.items.append(item)

Add a content item to this subsection.

Args: item: The Content instance to add. index: Optional position to insert the item. If not provided, the item is appended to the end of the list.

class Content: View Source

class Content:
    """A single text content item with a unique identifier.

    Content is the atomic unit of text in a document. Every instance carries
    an identifier (a UUID string unless one is supplied) so it can be tracked
    and referenced within subsections.

    Attributes:
        text: The text content of this item.
        id: A unique identifier for this content item. Auto-generated as a
            UUID string if not provided.

    Example:
        ```python
        content = Content(text="Revenue increased by 15% year-over-year.")
        print(content.id)  # "a1b2c3d4-..."
        print(str(content))  # "Revenue increased by 15% year-over-year."
        ```
    """
    text: str
    id: str

    def __init__(self, text: str, id = None) -> None:
        """Store the text and assign an identifier.

        Args:
            text: The text content.
            id: Optional identifier. A falsy value is replaced by a newly
                generated UUID string.
        """
        self.text = text
        if not id:
            id = str(uuid4())
        self.id = id

    def __str__(self) -> str:
        """Return the text content as a string.

        Returns:
            The text attribute if it is truthy, otherwise an empty string.
        """
        return self.text or ""

A single text content item with a unique identifier.

Content represents the atomic unit of text in a document. Each instance is assigned a UUID-based identifier for tracking and referencing within subsections.

Attributes: text: The text content of this item. id: A unique identifier for this content item. Auto-generated as a UUID string if not provided.

Example:


content = Content(text="Revenue increased by 15% year-over-year.")
print(content.id)  # "a1b2c3d4-..."
print(str(content))  # "Revenue increased by 15% year-over-year."

Content(text: str, id=None) View Source

42    def __init__(self, text: str, id = None) -> None:
43        """Initialize a new Content item.
44
45        Args:
46            text: The text content.
47            id: Optional identifier. If not provided, a new UUID is generated.
48        """
49        self.id = id if id else str(uuid4())
50        self.text = text

Initialize a new Content item.

Args: text: The text content. id: Optional identifier. If not provided, a new UUID is generated.

text: str

id: str

def load_markdown(markdown: str, check_todo: bool = False) -> Document: View Source

 35def load_markdown(markdown: str, check_todo: bool = False) -> Document:
 36    """Parse a structured markdown string into a Document object.
 37
 38    Parses markdown text with a strict hierarchical structure. The
 39    parser expects the following structure:
 40
 41    1. Title line starting with `# `
 42    2. Tagline starting with `> `
 43    3. Sections starting with `## `
 44    4. Subsections starting with `### `
 45    5. Bullet content starting with `- `
 46
 47    The function validates structural integrity (e.g., subsections must belong
 48    to a section, content must belong to a subsection) and optionally ensures that the
 49    rendered preview contains exactly one `<TODO>` placeholder as the sole
 50    non-markdown content on its line.
 51
 52    Args:
 53        markdown: A markdown-formatted string adhering to the expected structure.
 54            Must contain a title, tagline, at least one section with subsections
 55            and content, and exactly one `<TODO>` placeholder.
 56        check_todo: If True, enforces the presence of exactly one `<TODO>`
 57
 58    Returns:
 59        A `Document` object with parsed header, tagline, sections, subsections,
 60        and content items.
 61
 62    Raises:
 63        ValidationError: If the markdown does not start with a title (`# `).
 64        ValidationError: If the title is not followed by a tagline (`> `).
 65        ValidationError: If a subsection is found before any section.
 66        ValidationError: If content is found before any subsection.
 67        ValidationError: If a line does not match the expected format.
 68        ValidationError: If the preview does not contain exactly one `<TODO>` and check_todo is True.
 69        ValidationError: If `<TODO>` is not the only non-markdown content on its line and check_todo is True.
 70
 71    Example:
 72        ```python
 73        markdown = '''
 74        # Project Proposal
 75        > A groundbreaking new idea
 76
 77        ## Overview
 78        ### Summary
 79        - This project aims to solve X
 80        - <TODO>
 81        '''
 82
 83        doc = load_markdown(markdown)
 84        print(doc.header)  # "Project Proposal"
 85        print(doc.tagline)  # "A groundbreaking new idea"
 86        ```
 87    """
 88    lines = markdown.strip().split('\n')
 89    
 90    # Validate title (must be first non-empty line)
 91    if not lines or not lines[0].startswith('# '):
 92        raise ValidationError("Document must start with a title (# Title)")
 93    
 94    header = lines[0][2:].strip()
 95    
 96    # Validate tagline (must be second non-empty line)
 97    idx = 1
 98    while idx < len(lines) and not lines[idx].strip():
 99        idx += 1
100    
101    if idx >= len(lines) or not lines[idx].startswith('> '):
102        raise ValidationError("Title must be followed by a tagline (> Tagline)")
103    
104    tagline = lines[idx][2:].strip()
105    idx += 1
106    
107    doc = Document(header=header, tagline=tagline)
108    current_section = None
109    current_subsection = None
110    
111    while idx < len(lines):
112        line = lines[idx].strip()
113        
114        if not line:
115            idx += 1
116            continue
117            
118        if line.startswith('## '):
119            current_section = line[3:].strip()
120            doc.create_section(current_section)
121            current_subsection = None
122            
123        elif line.startswith('### '):
124            if current_section is None:
125                raise ValidationError("Subsection found before any section")
126            subsection_title = line[4:].strip()
127            doc.create_subsection(current_section, subsection_title)
128            current_subsection = doc.sections[current_section][-1]
129            
130        elif line.startswith('- '):
131            if current_subsection is None:
132                raise ValidationError("Content found before any subsection")
133            content_text = line[2:].strip()
134            current_subsection.add_content(Content(content_text))
135            
136        else:
137            raise ValidationError(f"Invalid line format: '{line}'. Expected ##, ###, or -")
138        
139        idx += 1
140    
141    if check_todo:
142        # Validate exactly one <TODO> on its own line
143        preview = doc.preview()
144        
145        todo_count = preview.count('<TODO>')
146        
147        if todo_count != 1:
148            print(preview)
149            raise ValidationError(f"Preview must contain exactly one <TODO>, found {todo_count}")
150        
151        # Check that <TODO> is alone on its line (aside from markdown markers)
152        preview_lines = preview.split('\n')
153        todo_line_found = False
154        for line in preview_lines:
155            if '<TODO>' in line:
156                # Strip markdown markers (-, ##, ###, >, #, whitespace)
157                stripped = line.lstrip('#').lstrip('>').lstrip('-').strip()
158                if stripped != '<TODO>':
159                    raise ValidationError(f"<TODO> must be the only non-markdown content on its line, found: '{line.strip()}'")
160                todo_line_found = True
161                break
162        
163        if not todo_line_found:
164            raise ValidationError("Preview must contain exactly one <TODO>")
165    
166    return doc

Parse a structured markdown string into a Document object.

Parses markdown text with a strict hierarchical structure. The parser expects the following structure:

Title line starting with #
Tagline starting with >
Sections starting with ##
Subsections starting with ###
Bullet content starting with -

The function validates structural integrity (e.g., subsections must belong to a section, content must belong to a subsection) and optionally ensures that the rendered preview contains exactly one <TODO> placeholder as the sole non-markdown content on its line.

Args: markdown: A markdown-formatted string adhering to the expected structure. It must begin with a title followed by a tagline; sections, subsections and content are optional. check_todo: If True, additionally enforces that the rendered preview contains exactly one <TODO> placeholder and that it is the only non-markdown content on its line. Defaults to False.

Returns: A Document object with parsed header, tagline, sections, subsections, and content items.

Raises: ValidationError: If the markdown does not start with a title (#). ValidationError: If the title is not followed by a tagline (>). ValidationError: If a subsection is found before any section. ValidationError: If content is found before any subsection. ValidationError: If a line does not match the expected format. ValidationError: If the preview does not contain exactly one <TODO> and check_todo is True. ValidationError: If <TODO> is not the only non-markdown content on its line and check_todo is True.

Example:


markdown = '''
# Project Proposal
> A groundbreaking new idea

## Overview
### Summary
- This project aims to solve X
- <TODO>
'''

doc = load_markdown(markdown)
print(doc.header)  # "Project Proposal"
print(doc.tagline)  # "A groundbreaking new idea"

class ValidationError(builtins.Exception): View Source

class ValidationError(Exception):
    """Raised when markdown doesn't conform to the expected structure."""

Raised when markdown doesn't conform to expected structure

async def hydrate( markdown: str, context: str = '', timeout: int = 30, model: str = 'gpt-4o', contexts: dict[str, str] | None = None) -> tuple[str, dict]: View Source

async def hydrate(
    markdown: str,
    context: str = "",
    timeout: int = 30,
    model: str = "gpt-4o",
    contexts: dict[str, str] | None = None,
) -> tuple[str, dict]:
    """Resolves all `<A, B, C, context_id>` placeholders in a markdown document.

    Processes batches sequentially (lower A first), running all items within
    a batch concurrently. Each batch sees the results of all previous batches
    in the document context.

    Args:
        markdown: A markdown string containing placeholders.
        context: Optional global context/instructions to include alongside
                 the document preview for every placeholder.
        timeout: Maximum time in seconds to wait for each batch to complete.
        model: The LLM model to use for text generation.
        contexts: Optional mapping of context ID to context text. The text
                  registered for a task's context_id is passed to
                  `hydrate_item` as `task_context`; tasks without a
                  context_id get an empty string.

    Returns:
        A tuple of (hydrated_markdown, metadata) where metadata is a dict with:
            - tasks: list of per-task metadata dicts
            - total_elapsed_ms: total elapsed time in milliseconds
            - model: the model used
            - marker_document: the original markdown with each placeholder
              replaced by its task marker

    Raises:
        ValueError: If a placeholder references a context ID that is not a
            key of `contexts`.
        asyncio.TimeoutError: If a batch does not complete within `timeout`
            seconds.
    """
    if contexts is None:
        contexts = {}

    queue = HydrateQueue(markdown)

    # Validate that all referenced context IDs exist
    missing = [t.context_id for t in queue._tasks if t.context_id and t.context_id not in contexts]
    if missing:
        unique_missing = sorted(set(missing))
        raise ValueError(f"Missing context(s): {', '.join(unique_missing)}. Add them with `doc-weaver context add`.")

    task_metadata = []
    global_task_number = 0
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    total_start = time.perf_counter()

    while not queue.done:
        # Read the batch number before next_batch() is called.
        batch_num = queue.current_batch_number
        batch = queue.next_batch()
        # NOTE(review): results are paired with these tasks by position, which
        # assumes next_batch() yields items in the same order as the matching
        # entries of queue._tasks; confirm against HydrateQueue.
        batch_tasks = [t for t in queue._tasks if t.batch == batch_num]

        coros = [
            hydrate_item(doc, min_c, max_c, context, model, task_context=contexts.get(ctx_id, "") if ctx_id else "")
            for doc, min_c, max_c, ctx_id in batch
        ]
        # The timeout applies to the whole batch, not to each item.
        results_with_timing = await asyncio.wait_for(asyncio.gather(*coros), timeout=timeout)

        replacements = []
        for i, (text, elapsed_ms) in enumerate(results_with_timing):
            task = batch_tasks[i]
            task_metadata.append({
                "task_number": global_task_number,
                "marker": task.marker,
                "batch_num": batch_num,
                "char_range": [task.min_chars, task.max_chars],
                "total_chars": len(text),
                "elapsed_ms": round(elapsed_ms, 2),
                "model": model,
                "context_id": task.context_id,
            })
            replacements.append(text)
            global_task_number += 1

        queue.submit_results(replacements)

    total_elapsed_ms = (time.perf_counter() - total_start) * 1000

    # The marker document shows <<TASK_N>> placeholders instead of content
    marker_doc = queue._inject_markers(queue._original_markdown)

    metadata = {
        "tasks": task_metadata,
        "total_elapsed_ms": round(total_elapsed_ms, 2),
        "model": model,
        "marker_document": marker_doc,
    }

    return queue.current_markdown, metadata

Resolves all `<A, B, C, context_id>` placeholders in a markdown document.

Processes batches sequentially (lower A first), running all items within a batch concurrently. Each batch sees the results of all previous batches in the document context.

Args: markdown: A markdown string containing placeholders. context: Optional global context/instructions to include alongside the document preview for every placeholder. timeout: Maximum time in seconds to wait for each batch to complete. model: The LLM model to use for text generation. contexts: Optional mapping of context ID to context text. Tasks that reference a context_id will have the corresponding text prepended to their content in addition to the global context.

Returns: A tuple of (hydrated_markdown, metadata) where metadata is a dict with: - tasks: list of per-task metadata dicts - total_elapsed_ms: total wall-clock time in milliseconds - model: the model used - marker_document: the original markdown with each placeholder replaced by its task marker

class HydrateQueue: View Source

 55class HydrateQueue:
 56    """Builds a queue of batches from a markdown document containing <A, B, C, context_id> placeholders.
 57
 58    Takes a markdown document string and builds a queue by replacing <A, B, C, context_id>
 59    with actual content in batches. A determines the batch order where lower
 60    numbers come first and equal numbers are filled concurrently. B is the
 61    inclusive lower bound for number of characters allowed in the replacement
 62    text. C is the inclusive upper bound.
 63
 64    Each batch depends on the previous batch's results, so the document is
 65    updated between batches. Members of the same batch do not depend on each
 66    other's completed responses.
 67    """
 68
 69    def __init__(self, markdown: str):
 70        """Initialize the queue from a markdown document containing placeholders.
 71
 72        Parses all `<A, B, C, context_id>` placeholders in the document, replaces them with
 73        unique markers, and prepares the batch processing queue sorted by batch
 74        number.
 75
 76        Args:
 77            markdown: Markdown document string containing zero or more `<A, B, C, context_id>`
 78                placeholders. Placeholders are regex-matched as `<int, int, int>` or
 79                `<int, int, int, identifier>`.
 80        """
 81        self._original_markdown = markdown
 82        self._tasks = self._parse_tasks()
 83        self._current_markdown = self._inject_markers(markdown)
 84        self._batch_numbers = sorted(set(t.batch for t in self._tasks))
 85        self._batch_index = 0
 86
 87    def _parse_tasks(self) -> List[HydrationTask]:
 88        """Extract and parse all placeholders from the original markdown.
 89
 90        Scans the document for `<A, B, C, context_id>` patterns and creates a `HydrationTask`
 91        for each, assigning a unique sequential marker like `<<TASK_0>>`.
 92
 93        Returns:
 94            List of `HydrationTask` objects in order of appearance in the document.
 95        """
 96        tasks = []
 97        for i, match in enumerate(PLACEHOLDER_PATTERN.finditer(self._original_markdown)):
 98            batch = int(match.group(1))
 99            min_chars = int(match.group(2))
100            max_chars = int(match.group(3))
101            context_id = match.group(4)  # None if not present
102            marker = f"<<TASK_{i}>>"
103            tasks.append(HydrationTask(batch, min_chars, max_chars, match.group(0), marker, context_id))
104        return tasks
105
106    def _inject_markers(self, markdown: str) -> str:
107        """Replace each placeholder with its unique marker, working right-to-left
108        so earlier offsets remain valid."""
109        result = markdown
110        matches = list(PLACEHOLDER_PATTERN.finditer(markdown))
111        for task, match in reversed(list(zip(self._tasks, matches))):
112            result = result[:match.start()] + task.marker + result[match.end():]
113        return result
114
115    @property
116    def done(self) -> bool:
117        """Check whether all batches have been processed.
118
119        Returns:
120            True if all batches have been processed and no more work remains,
121            False otherwise.
122        """
123        return self._batch_index >= len(self._batch_numbers)
124
125    @property
126    def current_batch_number(self) -> int | None:
127        """Get the batch number currently being processed.
128
129        Returns:
130            The integer batch number (A value from `<A, B, C, context_id>`) for the current
131            batch, or None if all batches are complete.
132        """
133        if self.done:
134            return None
135        return self._batch_numbers[self._batch_index]
136
137    def next_batch(self) -> List[Tuple[Document, int, int, str | None]]:
138        """Returns the next batch as a list of (Document, min_chars, max_chars, context_id).
139
140        For each task in the current batch, produces a Document where:
141        - The current task's placeholder is replaced with <TODO>
142        - All other unresolved placeholders are replaced with (will be filled later)
143        """
144        if self.done:
145            raise StopIteration("All batches have been processed.")
146
147        batch_num = self._batch_numbers[self._batch_index]
148        batch_tasks = [t for t in self._tasks if t.batch == batch_num]
149
150        marker_pattern = re.compile(r'<<TASK_\d+>>')
151
152        results = []
153        for task in batch_tasks:
154            md = self._current_markdown
155            # Replace the current task's marker with <TODO>
156            md = md.replace(task.marker, '<TODO>')
157            # Replace all other remaining markers with (will be filled later)
158            md = marker_pattern.sub('(will be filled later)', md)
159            doc = load_markdown(md, check_todo=True)
160            results.append((doc, task.min_chars, task.max_chars, task.context_id))
161
162        return results
163
164    def submit_results(self, results: List[str]) -> None:
165        """Accepts results for the current batch and advances to the next.
166
167        Args:
168            results: A list of replacement strings, one per task in the current
169                     batch, in the same order returned by next_batch().
170        """
171        if self.done:
172            raise StopIteration("All batches have been processed.")
173
174        batch_num = self._batch_numbers[self._batch_index]
175        batch_tasks = [t for t in self._tasks if t.batch == batch_num]
176
177        if len(results) != len(batch_tasks):
178            raise ValueError(
179                f"Expected {len(batch_tasks)} results for batch {batch_num}, "
180                f"got {len(results)}."
181            )
182
183        for task, replacement in zip(batch_tasks, results):
184            self._current_markdown = self._current_markdown.replace(
185                task.marker, replacement
186            )
187
188        self._batch_index += 1
189
190    @property
191    def current_markdown(self) -> str:
192        """Get the current state of the markdown document.
193
194        As batches are processed and results submitted, this property reflects
195        the progressively hydrated document with completed placeholders replaced
196        by their generated text.
197
198        Returns:
199            The markdown string with all processed placeholders replaced and
200            any remaining placeholders still represented by their unique markers.
201        """
202        return self._current_markdown

Builds a queue of batches from a markdown document containing `<A, B, C, context_id>` placeholders.

Takes a markdown document string and builds a queue by replacing `<A, B, C, context_id>` with actual content in batches. A determines the batch order where lower numbers come first and equal numbers are filled concurrently. B is the inclusive lower bound for number of characters allowed in the replacement text. C is the inclusive upper bound.

Each batch depends on the previous batch's results, so the document is updated between batches. Members of the same batch do not depend on each other's completed responses.

HydrateQueue(markdown: str) View Source

69    def __init__(self, markdown: str):
70        """Initialize the queue from a markdown document containing placeholders.
71
72        Parses all `<A, B, C, context_id>` placeholders in the document, replaces them with
73        unique markers, and prepares the batch processing queue sorted by batch
74        number.
75
76        Args:
77            markdown: Markdown document string containing zero or more `<A, B, C, context_id>`
78                placeholders. Placeholders are regex-matched as `<int, int, int>` or
79                `<int, int, int, identifier>`.
80        """
81        self._original_markdown = markdown
82        self._tasks = self._parse_tasks()
83        self._current_markdown = self._inject_markers(markdown)
84        self._batch_numbers = sorted(set(t.batch for t in self._tasks))
85        self._batch_index = 0

Initialize the queue from a markdown document containing placeholders.

Parses all <A, B, C, context_id> placeholders in the document, replaces them with unique markers, and prepares the batch processing queue sorted by batch number.

Args: markdown: Markdown document string containing zero or more <A, B, C, context_id> placeholders. Placeholders are regex-matched as <int, int, int> or <int, int, int, identifier>.

done: bool View Source

115    @property
116    def done(self) -> bool:
117        """Check whether all batches have been processed.
118
119        Returns:
120            True if all batches have been processed and no more work remains,
121            False otherwise.
122        """
123        return self._batch_index >= len(self._batch_numbers)

Check whether all batches have been processed.

Returns: True if all batches have been processed and no more work remains, False otherwise.

current_batch_number: int | None View Source

125    @property
126    def current_batch_number(self) -> int | None:
127        """Get the batch number currently being processed.
128
129        Returns:
130            The integer batch number (A value from `<A, B, C, context_id>`) for the current
131            batch, or None if all batches are complete.
132        """
133        if self.done:
134            return None
135        return self._batch_numbers[self._batch_index]

Get the batch number currently being processed.

Returns: The integer batch number (A value from <A, B, C, context_id>) for the current batch, or None if all batches are complete.

def next_batch(self) -> List[Tuple[Document, int, int, str | None]]: View Source

137    def next_batch(self) -> List[Tuple[Document, int, int, str | None]]:
138        """Returns the next batch as a list of (Document, min_chars, max_chars, context_id).
139
140        For each task in the current batch, produces a Document where:
141        - The current task's placeholder is replaced with <TODO>
142        - All other unresolved placeholders are replaced with (will be filled later)
143        """
144        if self.done:
145            raise StopIteration("All batches have been processed.")
146
147        batch_num = self._batch_numbers[self._batch_index]
148        batch_tasks = [t for t in self._tasks if t.batch == batch_num]
149
150        marker_pattern = re.compile(r'<<TASK_\d+>>')
151
152        results = []
153        for task in batch_tasks:
154            md = self._current_markdown
155            # Replace the current task's marker with <TODO>
156            md = md.replace(task.marker, '<TODO>')
157            # Replace all other remaining markers with (will be filled later)
158            md = marker_pattern.sub('(will be filled later)', md)
159            doc = load_markdown(md, check_todo=True)
160            results.append((doc, task.min_chars, task.max_chars, task.context_id))
161
162        return results

Returns the next batch as a list of (Document, min_chars, max_chars, context_id).

For each task in the current batch, produces a Document where:

- The current task's placeholder is replaced with `<TODO>`
- All other unresolved placeholders are replaced with (will be filled later)

def submit_results(self, results: List[str]) -> None: View Source

164    def submit_results(self, results: List[str]) -> None:
165        """Accepts results for the current batch and advances to the next.
166
167        Args:
168            results: A list of replacement strings, one per task in the current
169                     batch, in the same order returned by next_batch().
170        """
171        if self.done:
172            raise StopIteration("All batches have been processed.")
173
174        batch_num = self._batch_numbers[self._batch_index]
175        batch_tasks = [t for t in self._tasks if t.batch == batch_num]
176
177        if len(results) != len(batch_tasks):
178            raise ValueError(
179                f"Expected {len(batch_tasks)} results for batch {batch_num}, "
180                f"got {len(results)}."
181            )
182
183        for task, replacement in zip(batch_tasks, results):
184            self._current_markdown = self._current_markdown.replace(
185                task.marker, replacement
186            )
187
188        self._batch_index += 1

Accepts results for the current batch and advances to the next.

Args: results: A list of replacement strings, one per task in the current batch, in the same order returned by next_batch().

current_markdown: str View Source

190    @property
191    def current_markdown(self) -> str:
192        """Get the current state of the markdown document.
193
194        As batches are processed and results submitted, this property reflects
195        the progressively hydrated document with completed placeholders replaced
196        by their generated text.
197
198        Returns:
199            The markdown string with all processed placeholders replaced and
200            any remaining placeholders still represented by their unique markers.
201        """
202        return self._current_markdown

Get the current state of the markdown document.

As batches are processed and results submitted, this property reflects the progressively hydrated document with completed placeholders replaced by their generated text.

Returns: The markdown string with all processed placeholders replaced and any remaining placeholders still represented by their unique markers.

async def hydrate_item( doc: Document, min_chars: int, max_chars: int, context: str = '', model: str = 'gpt-4o', task_context: str = '') -> tuple[str, float]: View Source

async def hydrate_item(doc: Document, min_chars: int, max_chars: int, context: str = "", model: str = "gpt-4o", task_context: str = "") -> tuple[str, float]:
    """Resolves a single <TODO> placeholder in a Document.

    Calls the responder to fill in the <TODO>, then uses text_morpher
    to adjust the result to fit within the character bounds [min_chars, max_chars].

    The morphing step is synchronous, so it runs in a worker thread; the
    event loop stays free for other items being hydrated concurrently.

    Args:
        doc: A Document with exactly one <TODO> placeholder.
        min_chars: Inclusive lower bound for replacement character count.
        max_chars: Inclusive upper bound for replacement character count.
        context: Optional global context/instructions to include alongside
                 the document preview.
        model: The LLM model to use for text generation.
        task_context: Optional per-task context text to include between the
                      global context and the document preview.

    Returns:
        A tuple of (replacement_string, elapsed_ms).

    Raises:
        RuntimeError: If the responder returns no result or text morphing fails.
    """
    import asyncio  # local import keeps this block self-contained

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    preview = doc.preview()
    # Skip empty parts so the prompt has no stray blank sections.
    parts = [p for p in [context, task_context, preview] if p]
    content = "\n\n".join(parts)
    message = HumanMessage(content=content)
    response = await todo_injector(message, model=model)

    if response is None:
        raise RuntimeError("Responder returned no result for <TODO> placeholder.")

    text = response.text

    # Fast path: the first answer already satisfies the bounds.
    if min_chars <= len(text) <= max_chars:
        elapsed_ms = (time.perf_counter() - start) * 1000
        return text, elapsed_ms

    # simple_morph is a blocking call; awaiting it via a worker thread avoids
    # stalling every other coroutine on the loop.
    # Can expand metadata here later if needed
    success, morphed_text, _, _, _ = await asyncio.to_thread(
        simple_morph,
        text=text,
        max_chars=max_chars,
        min_chars=min_chars,
        max_retries=3,
        model=model,
    )

    if not success:
        raise RuntimeError(
            f"Text morphing failed. Got {len(morphed_text)} chars, "
            f"needed [{min_chars}, {max_chars}]."
        )

    elapsed_ms = (time.perf_counter() - start) * 1000
    return morphed_text, elapsed_ms

Resolves a single `<TODO>` placeholder in a Document.

Calls the responder to fill in the `<TODO>`, then uses text_morpher to adjust the result to fit within the character bounds [min_chars, max_chars].

Args: doc: A Document with exactly one `<TODO>` placeholder. min_chars: Inclusive lower bound for replacement character count. max_chars: Inclusive upper bound for replacement character count. context: Optional global context/instructions to include alongside the document preview. model: The LLM model to use for text generation. task_context: Optional per-task context text to include between the global context and the document preview.

Returns: A tuple of (replacement_string, elapsed_ms).

Raises: RuntimeError: If the responder returns no result or text morphing fails.

def simple_morph( text, max_chars, min_chars, max_retries, model: str = 'gpt-4o') -> list[bool, str, int]: View Source

def simple_morph(text: str, max_chars: int, min_chars: int, max_retries: int, model: str = "gpt-4o") -> list[bool | str | int | float]:
    """A simple morph function that runs the TextMorphGraph.

    Args:
        text: The text to be morphed.
        max_chars: The maximum target character length.
        min_chars: The minimum target character length.
        max_retries: The maximum number of retries for LLM calls.
        model: Model identifier forwarded to the graph state.

    Returns:
        A five-element list containing:
            - success (bool): Whether the morphing was successful.
            - morphed_text (str): The morphed text, or the original `text`
              when morphing did not succeed.
            - total_calls (int): The total number of LLM calls made.
            - num_characters (int): The number of characters in the returned text.
            - elapsed_ms (float): The time taken in milliseconds.

    Raises:
        Exception: If an error occurs during the morphing process.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    try:
        result: TextMorphState = TextMorphGraph.invoke(TextMorphState(
            text=text,
            target_chars=(min_chars, max_chars),
            messages=[],
            responses=[],
            model=model,
            max_retries=max_retries,
            success=False
        ))

    except Exception as e:
        print(f"Error during text morphing: {e}")
        # Bare raise re-raises the active exception with its original traceback.
        raise

    elapsed_ms = (time.perf_counter() - start) * 1000

    # Guard against an empty response list so indexing cannot raise; in that
    # case the input text is treated as the final text.
    responses = result['responses']
    final_text = responses[-1] if responses else text

    # The returned state's 'max_retries' is read as the retries left over
    # (presumably decremented by the graph per attempt -- confirm against the
    # graph definition).
    total_calls = max_retries - result['max_retries'] + 1

    if final_text == text:
        # No initial attempt
        total_calls -= 1

    if not result['success']:
        return [False, text, total_calls, len(text), elapsed_ms]
    return [True, final_text, total_calls, len(final_text), elapsed_ms]

A simple morph function that runs the TextMorphGraph.

Args: text (str): The text to be morphed. max_chars (int): The maximum target character length. min_chars (int): The minimum target character length. max_retries (int): The maximum number of retries for LLM calls. model (str): The LLM model to use for morphing; defaults to 'gpt-4o'.

Returns: list[bool, str, int, int, float]: A list containing: - success (bool): Whether the morphing was successful. - morphed_text (str): The morphed text. - total_calls (int): The total number of LLM calls made. - num_characters (int): The number of characters in the morphed text. - elapsed_ms (float): The time taken in milliseconds.

Raises: Exception: If an error occurs during the morphing process.