Convert documents into a Deep Search document format using EasyOCR with CPU only, and export the document and its tables to the specified output directory.
Supported formats
PDF, IMAGE, DOCX, HTML, PPTX, ASCIIDOC, MD.
| PARAMETER | DESCRIPTION |
| --- | --- |
| input_file_path | The path to the input file. TYPE: Union[Path, str] |
| output_dir_path | The path to the output directory; defaults to "./output" when not given. TYPE: Optional[Union[Path, str]] DEFAULT: None |
| output_formats | The output formats ("markdown" and/or "json"). Defaults to ["markdown"]. TYPE: Optional[list[str]] DEFAULT: None |

| RETURNS | DESCRIPTION |
| --- | --- |
| list[Path] | The paths of the files exported to the output directory. |
Source code in autogen/agents/experimental/document_agent/parser_utils.py
@require_optional_import(["docling"], "rag")
@export_module("autogen.agents.experimental.document_agent")
def docling_parse_docs(  # type: ignore[no-any-unimported]
    input_file_path: Union[Path, str],
    output_dir_path: Optional[Union[Path, str]] = None,
    output_formats: Optional[list[str]] = None,
) -> list[Path]:
    """Convert documents into a Deep Search document format using EasyOCR
    with CPU only, and export the document and its tables to the specified
    output directory.

    Supported formats:
        PDF, IMAGE, DOCX, HTML, PPTX, ASCIIDOC, MD.

    Args:
        input_file_path (Union[Path, str]): The path to the input file.
        output_dir_path (Optional[Union[Path, str]]): The path to the output
            directory. Defaults to "./output".
        output_formats (Optional[list[str]]): The output formats, any of
            "markdown" and "json". Defaults to ["markdown"].

    Returns:
        list[Path]: Paths of the markdown/json files written per document.
            Table HTML files are written alongside them but not returned.

    Raises:
        ValueError: If no documents are found at ``input_file_path``.
    """
    output_dir_path = output_dir_path or Path("./output")
    output_formats = output_formats or ["markdown"]

    input_doc_paths: list[Path] = handle_input(input_file_path, output_dir=output_dir_path)
    if not input_doc_paths:
        raise ValueError("No documents found.")

    # Docling Parse PDF with EasyOCR (CPU only)
    # ----------------------
    pdf_pipeline_options = PdfPipelineOptions()
    pdf_pipeline_options.do_ocr = True
    # Force CPU-only OCR when the installed docling version exposes the flag.
    if hasattr(pdf_pipeline_options.ocr_options, "use_gpu"):
        pdf_pipeline_options.ocr_options.use_gpu = False
    pdf_pipeline_options.do_table_structure = True
    pdf_pipeline_options.table_structure_options.do_cell_matching = True
    pdf_pipeline_options.ocr_options.lang = ["en"]
    pdf_pipeline_options.accelerator_options = AcceleratorOptions(num_threads=4, device=AcceleratorDevice.AUTO)

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
        },
    )

    start_time = time.time()
    conv_results = list(doc_converter.convert_all(input_doc_paths))
    elapsed = time.time() - start_time
    logger.info(f"Document converted in {elapsed:.2f} seconds.")

    # Export results
    output_dir = Path(output_dir_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    conv_files: list[Path] = []
    for res in conv_results:
        doc_filename = res.input.file.stem
        logger.debug(f"Document {res.input.file.name} converted.\nSaved markdown output to: {output_dir!s}")
        logger.debug(res.document._export_to_indented_text(max_text_len=16))

        if "markdown" in output_formats:
            # Export Docling document format to markdown.
            # Explicit UTF-8: OCR output may contain non-ASCII characters
            # that the platform default encoding cannot represent.
            output_file = output_dir / f"{doc_filename}.md"
            with output_file.open("w", encoding="utf-8") as fp:
                fp.write(res.document.export_to_markdown())
            conv_files.append(output_file)

        if "json" in output_formats:
            # Export Docling document format to json
            output_file = output_dir / f"{doc_filename}.json"
            with output_file.open("w", encoding="utf-8") as fp:
                fp.write(json.dumps(res.document.export_to_dict()))
            conv_files.append(output_file)

        # Export each detected table as a standalone HTML file.
        for table_ix, table in enumerate(res.document.tables):
            element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
            logger.debug(f"Saving HTML table to {element_html_filename}")
            with element_html_filename.open("w", encoding="utf-8") as fp:
                fp.write(table.export_to_html())

    return conv_files