
    h                    2   d Z ddlmZ ddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZ ddlmZ ddlZddlZdd	lmZ dd
lm Z  ddl!m"Z" ddl#m$Z$m%Z% erddl&Z&ddl'Z'ddl(Z(ddl)Z)ddl*m+Z+ g dZ,g dZ-	 	 	 	 d)dZ. ej^                  e0      Z1dZ2dZ3dZ4dZ5h dZ6d*dZ7d+dZ8d+dZ9ddgZ:d,dZ; G d de       Z< G d de       Z= G d d e       Z> G d! d"e       Z? G d# d$e       Z@ G d% d&e       ZA G d' d(e       ZBy)-z(Module contains common parsers for PDFs.    )annotationsN)datetime)Path)TemporaryDirectory)TYPE_CHECKINGAnyBinaryIOIterableIteratorLiteralMappingOptionalSequenceUnioncast)urlparse)Document)BaseBlobParser)Blob)BaseImageBlobParserRapidOCRBlobParser)TextLinearizationConfig)	DCTDecodeDCT	JPXDecode)	LZWDecodeLZWFlateDecodeFlASCII85DecodeA85ASCIIHexDecodeAHxRunLengthDecodeRLCCITTFaxDecodeCCFJBIG2Decodec                    	 ddl m}  |       }d}| D ]6  } ||      \  }}|s|D cg c]  }|d   	 }}dj                  |      z  }8 |S # t        $ r t        d      w xY wc c}w )zExtract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    r   )RapidOCRzc`rapidocr-onnxruntime` package not found, please install it with `pip install rapidocr-onnxruntime`    
)rapidocr_onnxruntimer*   ImportErrorjoin)imagesr*   ocrtextimgresult_s          n/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_community/document_loaders/parsers/pdf.py!extract_from_images_with_rapidocrr8   @   s    
1 *CD &H	*01$d1g1F1DIIf%%D	&
 K  
1
 	

 2s   A A&A#z

{image_text}

r-   z
>   sourcecreatorproducertotal_pagescreationdatec                    |rU| j                   xs d}|dk(  r|j                  dd      }d| d| d}|S |dk(  rd	t        j                  |d
       d| d}|S )a  Format the content of the image with the source of the blob.

    blob: The blob containing the image.
    format::
      The format for the parsed output.
      - "text" = return the content as is
      - "markdown-img" = wrap the content into an image markdown link, w/ link
      pointing to (`![body)(#)`]
      - "html-img" = wrap the content as the `alt` text of an tag and link to
      (`<img alt="{body}" src="#"/>`)
    #zmarkdown-img]z\\]z![z]()zhtml-imgz
<img alt="T)quotez src="z" />)r9   replacehtmlescape)blobcontentformatr9   s       r7   _format_inner_imagerI   i   sx     #^#ooc62G7)2fXQ/G N z!"4;;wd#C"DF6(RVWGN    c                    t         j                  | j                               st        d      t	        | j                  dd      t              st        d      | S )zValidate that the metadata has all the standard keys and the page is an integer.

    The standard keys are:
    - source
    - total_page
    - creationdate
    - creator
    - producer

    Validate that page is an integer if it is present.
    z3The PDF parser must valorize the standard metadata.pager   z(The PDF metadata page must be a integer.)_STD_METADATA_KEYSissubsetkeys
ValueError
isinstancegetint)metadatas    r7   _validate_metadatarU      sJ     &&x}}7NOOhll61-s3CDDOrJ   c                   i }ddd}| j                         D ]  \  }}t        |      t        t        fvrt        |      }|j	                  d      r|dd }|j                         }|dv r:	 t        j                  |j                  dd	      d
      j                  d      ||<   ||v r||||   <   |||<   t        |t              r|j                         ||<   t        |t              s|||<    |S # t        $ r |||<   Y w xY w)zPurge metadata from unwanted keys and normalize key names.

    Args:
        metadata: The original metadata dictionary.

    Returns:
        The cleaned and normalized the key format of metadata dictionary.
    r<   r9   )
page_count	file_path/r,   N)r=   moddate'r+   zD:%Y%m%d%H%M%S%zT)itemstypestrrS   
startswithlowerr   strptimerC   	isoformatrP   rQ   strip)rT   new_metadatamap_keykvs        r7   _purge_metadatari      s    $&L#G    173*$AA<<!"AGGI++$"*"3"3IIc2&(:#)C. Q
 '\'(L$LO3ggiLO3LO) *   $"#Q$s   +8C44DDz




c                    	 	 	 	 	 	 	 	 dfd | |d      }|s1d}dj                  t        d |             }|rt        d   |z   }||z   }|S )a5  Insert extras such as image/table in a text between two paragraphs if possible,
    else at the end of the text.

    Args:
        extras: List of extra content (images/tables) to insert.
        text_from_page: The text content from the page.

    Returns:
        The merged text with extras inserted.
    c                    | rwt         D ]j  }|j                  |      }|dk7  sd }|r 	| |d | d      }|r	|||d  z   }n3d}dj                  t        d |             }|r||z   }|d | |z   ||d  z   } |S  d }|S |}|S )NFr+   rj   c                    | S N xs    r7   <lambda>zO_merge_text_and_extras.<locals>._recurs_merge_text_and_extras.<locals>.<lambda>   s    ! rJ   )_PARAGRAPH_DELIMITERrfindr0   filter)
extrastext_from_pagerecursdelimposprevious_textall_text
all_extras
str_extras_recurs_merge_text_and_extrass
            r7   r   z=_merge_text_and_extras.<locals>._recurs_merge_text_and_extras   s     -  $**51"9$(M(E"N4C$8%) %#0>#$3G#G%'
%+[[V1L%M
%).);J*4C0:=st@TT ! 
 1 *    &HrJ   Tr+   rj   c                    | S ro   rp   rq   s    r7   rs   z(_merge_text_and_extras.<locals>.<lambda>   s    ! rJ   rm   )rw   	list[str]rx   r_   ry   boolreturnOptional[str])r0   rv   rt   )rw   rx   r}   r~   r   r   s        @r7   _merge_text_and_extrasr      sv    +.8<	< -V^TJH
[[V!<=
-b1J>J!J.OrJ   c                  h     e Zd ZdZ	 	 d
deddddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZdd	Z xZS )PyPDFParsera  Parse a blob from a PDF using `pypdf` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images.
    It integrates the 'pypdf' library for PDF processing and offers synchronous blob
    parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyPDFParser

            parser = PyPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # images_parser = TesseractBlobParser(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NrL   r3   plain)modepages_delimiterimages_parserimages_inner_formatextraction_modeextraction_kwargsc                   t         	|           |dvrt        d      || _        |r|s
t	               }|| _        || _        || _        || _        || _	        || _
        |xs i | _        y)u  Initialize a parser based on PyPDF.

        Args:
            password: Optional password for opening encrypted PDFs.
            extract_images: Whether to extract images from the PDF.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            extraction_mode: “plain” for legacy functionality, “layout” extract text
                in a fixed width format that closely adheres to the rendered layout in
                the source pdf.
            extraction_kwargs: Optional additional parameters for the extraction
                process.

        Raises:
            ValueError: If the `mode` is not "single" or "page".
        singlerL   mode must be single or pageN)super__init__rP   extract_imagesr   r   r   passwordr   r   r   r   )
selfr   r   r   r   r   r   r   r   	__class__s
            r7   r   zPyPDFParser.__init__$  sv    J 	)):;;,-.0M*#6  	..!2!8brJ   c              #  @   K   	 ddl d fd}|j                         5 } j                  | j                        }t        dddd	t        t        |j                  xs i       z  |j                  t        |j                        d
z        }g }t        |j                        D ]  \  }} ||      }	 j                  |      }
t        |
g|	      j                         } j                   dk(  r,t#        |t%        |||j&                  |   dz               v|j)                  |         j                   dk(  r1t#         j*                  j-                  |      t%        |             ddd       y# t        $ r t        d      w xY w# 1 sw Y   yxY ww)m  
        Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pypdf` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   NzE`pypdf` package not found, please install it with `pip install pypdf`rL   c                    j                   j                  d      r| j                         S  | j                  ddj                  ij                  S )z
            Extract text from image given the version of pypdf.

            Args:
                page: The page object to extract text from.

            Returns:
                str: The extracted text.
            3r   rp   )__version__r`   extract_textr   r   )rL   pypdfr   s    r7   _extract_text_from_pagez7PyPDFParser.lazy_parse.<locals>._extract_text_from_pagem  sY       ++C0((**(t(( $($8$8,, rJ   r   PyPDFr+   r;   r:   r=   )r9   r<   )rL   )rL   
page_labelpage_contentrT   r   )rL   zpypdf.PageObjectr   r_   )r   r/   as_bytes_io	PdfReaderr   ri   r   dictrT   r9   lenpages	enumerateextract_images_from_pager   rd   r   r   rU   page_labelsappendr   r0   )r   rF   r   pdf_file_obj
pdf_readerdoc_metadatasingle_textspage_numberrL   rx   images_from_pager}   r   s   `           @r7   
lazy_parsezPyPDFParser.lazy_parseW  s    		$  #	<(NJ*$"MtZ006B78 #kk#&z'7'7#8L L%.z/?/?%@ 2!T!8d!C#'#@#@#F 1%&%'  99&"%-!3((3.8.D.D[.Q"	 	 !''1%2& yyH$!%!5!5!:!:<!H/= A#	 #	/  	W 	.#	 #	s3   FE: FEF1	F:FFFFc           	         | j                   syddl}ddlm} dt	        t
        |d         j                         vry|d   d   j                         }g }|D ]  }d}||   d   dk(  st        ||   d	         |j                  j                  j                  u r||   d	   d
d n||   d	   d   d
d }|t        v rX||   d   ||   d   }
}	t        j                  ||   j                         t        j                         j#                  |	|
d      }nf|t$        v rIt        j&                  |j)                  t+        j,                  ||   j                                           }nt.        j1                  d       |&t+        j,                         }|j3                         j4                  dk(  rY|j7                  |      j9                  |d       t;        j<                  |j?                         d      }tA        | j                   jC                  |            jD                  }|jG                  tI        ||| jJ                                tL        jO                  tP        jS                  tU        d|                  S )Extract images from a PDF page and get the text using images_to_text.

        Args:
            page: The page object from which to extract images.

        Returns:
            str: The extracted text from the images on the page.
        r+   r   NImagez/XObjectz
/Resourcesz/Subtypez/Imagez/Filterr,   z/Heightz/Widthdtyperm   Unknown PDF Filter!PNG)rH   z	image/png	mime_type
image_text)+r   r   PILr   r   r   rO   
get_objectr^   generic_base
NameObject_PDF_FILTER_WITHOUT_LOSSnp
frombufferget_datauint8reshape_PDF_FILTER_WITH_LOSSarrayopenioBytesIOloggerwarning	getbuffernbytes	fromarraysaver   	from_datagetvaluenextr   r   r   rI   r   _FORMAT_IMAGE_STRrH   _JOIN_IMAGESr0   rv   )r   rL   r   r   xObjectr1   objnp_image
img_filterheightwidthimage_bytesrF   r   s                 r7   r   z$PyPDFParser.extract_images_from_page  sB    !!T$\(:;@@BB|$Z0;;= 	C Hs|J'83 GCL348K8K8V8VV CL+AB/ i03AB7 
 !99$+CL$;WS\(=SEF!}}--/rxx gfeR0   #88!xx

2::gcl>S>S>U3V(WXH NN#89'"$**,K",,.55: OOH-22;u2M>>+*>*>*@KXD!%d&8&8&C&CD&I!J!W!WJMM+D*d>V>VW9	> !''#((f)=> ( 
 	
rJ   NF)r   zOptional[Union[str, bytes]]r   r   r   Literal['single', 'page']r   r_   r   Optional[BaseImageBlobParser]r   +Literal['text', 'markdown-img', 'html-img']r   zLiteral['plain', 'layout']r   Optional[dict[str, Any]]rF   r   r   Iterator[Document])rL   zpypdf._page.PageObjectr   r_   )	__name__
__module____qualname____doc___DEFAULT_PAGES_DELIMITERr   r   r   __classcell__r   s   @r7   r   r      s    .d 15$19
 +177;KQ6=6:19-19 19
 (19 19 519 I19 419 419fKZ4
rJ   r   c                       e Zd ZdZdZ	 dddedddd	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZedd       Zedd	       Z		 	 d	 	 	 	 	 	 	 dd
Z
ddZ xZS )PDFMinerParsera  Parse a blob from a PDF using `pdfminer.six` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'pdfminer.six' library for PDF processing and offers synchronous
    blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pdfminer.six pillow

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PDFMinerParser

            parser = PDFMinerParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    FNr   r3   )r   r   r   r   r   concatenate_pagesc               ,   t         |           |dvrt        d      |r|s
t               }|| _        || _        || _        || _        || _        || _	        |<t        j                  s dt        _        t        j                  d       |rdnd| _        yy)aH  Initialize a parser based on PDFMiner.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: Extraction mode to use. Either "single" or "page" for page-wise
                extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from PDF.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            concatenate_pages: Deprecated. If True, concatenate all PDF pages
                into one a single document. Otherwise, return one document per page.

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the `mode` is not "single" or "page".

        Warnings:
            `concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'
            instead.
        r   r   NTzS`concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'` instead.r   rL   )r   r   rP   r   r   r   r   r   r   r   r   _warn_concatenate_pagesr   r   )	r   r   r   r   r   r   r   r   r   s	           r7   r   zPDFMinerParser.__init__  s    P 	)):;;-.0M,*#6  	.(!999=6= %66DI )rJ   c                    ddl m t        | t              r!| j	                  d      rt        | dd dd      S 	 d | D        }d	j                  fd
|D              S # t        $ r t        |       cY S w xY w)z
        Decodes a PDFDocEncoding string to Unicode.
        Adds py3 compatibility to pdfminer's version.

        Args:
            s: The string to decode.

        Returns:
            str: The decoded Unicode string.
        r   )PDFDocEncodings      Nzutf-16beignorec              3  V   K   | ]!  }t        |t              rt        |      n| # y wro   )rQ   r_   ord).0cs     r7   	<genexpr>z-PDFMinerParser.decode_text.<locals>.<genexpr>]  s"     CAjC0CFa7Cs   ')r+   c              3  (   K   | ]	  }|     y wro   rp   )r   or   s     r7   r  z-PDFMinerParser.decode_text.<locals>.<genexpr>^  s     ;>!,;s   )pdfminer.utilsr   rQ   bytesr`   r_   r0   
IndexError)sordsr   s     @r7   decode_textzPDFMinerParser.decode_textL  sn     	2aALL$=quj(33	CCD77;d;;; 	q6M	s   "A A43A4c                   ddl m} t        | d      r| j                         } t	        | t
              r#t        t        t        j                  |             S t	        | |      rt        j                  | j                        S t	        | t        t        f      rt        j                  |       S t	        | t              r2| j                         D ]  \  }}t        j                  |      | |<    | S | S )z
        Recursively resolve the metadata values.

        Args:
            obj: The object to resolve and decode. It can be of any type.

        Returns:
            The resolved and decoded object.
        r   )	PSLiteralresolve)pdfminer.psparserr  hasattrr  rQ   listmapr   resolve_and_decoder	  namer_   r  r   r]   )r   r  rg   rh   s       r7   r  z!PDFMinerParser.resolve_and_decodeb  s     	03	"++-Cc4 N==sCDDY'!--chh77c5\*!--c22T"		 >1'::1=A>J
rJ   c           	        ddl m}m}m}  ||      } ||||      }i }	|j                  D ]  }
|	j                  |
        |	j                         D ]  \  }}	 t        j                  |      |	|<     t        t        |j                  |                  |	d<   |	S # t        $ r*}t        j                  d|t        |             Y d}~wd}~ww xY w)ag  
        Extract metadata from a PDF file.

        Args:
            fp: The file pointer to the PDF file.
            password: The password for the PDF file, if encrypted. Defaults to an empty
                string.
            caching: Whether to cache the PDF structure. Defaults to True.

        Returns:
            Metadata of the PDF file.
        r   )PDFDocumentPDFPage	PDFParser)r   cachingzD[WARNING] Metadata key "%s" could not be parsed due to exception: %sNr<   )pdfminer.pdfpager  r  r  infoupdater]   r   r  	Exceptionr   r   r_   r   r  create_pages)r   fpr   r  r  r  r  parserdocrT   r  rg   rh   es                 r7   _get_metadatazPDFMinerParser._get_metadata~  s    $ 	ED 2&8WEHH 	"DOOD!	"NN$ 	DAq
,??B	 #&d7+?+?+D&E"F   $F	 s   B	C$ C		Cc              #     K   	 ddl }ddlm} ddlm}mmm}m}m	m
 ddlm}m} ddlm}	 t!        |j"                        dk  rt%        d      	 |j'                         5 }
t)               5 |	j+                  |
 j,                  xs d
      } |       }t/        ddd
d j1                  |
 j,                  xs d
      z        }|j2                  |d<    G  fdd|      }t5        j6                          || || |                   }g }t9        |      D ]  \  }}j;                  d       j=                  d       |j?                  |       jA                         }|jC                         } jD                  dk(  r@j;                  d       j=                  d       tG        |tI        |d|iz               |jK                  d      r|dd }|jM                  |         jD                  dk(  r3 jN                  jQ                  |      }tG        |tI        |             ddd       ddd       y# t$        $ r t%        d	      w xY w# 1 sw Y   *xY w# 1 sw Y   yxY ww)a  
        Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pdfminer.six` or `pillow` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   N)PDFLayoutAnalyzer)LAParamsLTContainerLTImageLTItemLTPageLTText	LTTextBox)PDFPageInterpreterPDFResourceManager)r  i:>4zThis parser is tested with pdfminer.six version 20201018 or later. Remove pdfminer, and install pdfminer.six with `pip uninstall pdfminer && pip install pdfminer.six`.zMpdfminer package not found, please install it with `pip install pdfminer.six`r+   r   PDFMinerr   r9   c                  N     e Zd Z	 	 d	 	 	 	 	 	 	 d fdZdfdZ xZS )*PDFMinerParser.lazy_parse.<locals>.Visitorc                *    t         |   |||       y )N)pagenolaparams)r   r   )r   rsrcmgrr1  r2  r   s       r7   r   z3PDFMinerParser.lazy_parse.<locals>.Visitor.__init__  s     G$WVh$OrJ   c           	     2    d	fd |       y )Nc                J   t        |       r| D ]
  } |        n+t        | 	      rj                  | j                                t        | 
      rj                  d       y t        |       rj                  rddlm}  |      }|j                  |       }t        j                  t              |z        }d|j                  d<   t        j                  j                  |            j                  }j                  t        ||j                               y y y )Nr-   r   )ImageWriterr?   r9   )rQ   writeget_textr   pdfminer.imager6  export_imager   	from_pathr   rT   r   r   r   rI   r   )itemchildr6  image_writerfilenamerF   r   r%  r&  r)  r*  renderr   tempdirtext_ios          r7   r@  zIPDFMinerParser.lazy_parse.<locals>.Visitor.receive_layout.<locals>.render  s    %dK8)- . &u.'f5#MM$--/:%dI6#MM$/'g6#11 F/:7/C+7+D+DT+J'+~~d7mh6N'O:=h 7-1$($6$6$A$A$$G.""., !+ !($7(,j$:R:R%&!"  2" !rJ   )r<  r'  r   Nonerp   )
meltpager@  r%  r&  r)  r*  r   rA  rB  s
     @r7   receive_layoutz9PDFMinerParser.lazy_parse.<locals>.Visitor.receive_layout  s    ! !8 6NrJ   )r,   N)r3  r,  r1  rS   r2  zOptional[LAParams]r   rC  )rE  r(  r   rC  )r   r   r   r   rF  r   )r   r%  r&  r)  r*  r   rA  rB  s   @r7   Visitorr/    sE     #$37	P/P  P 1	P
 P# #rJ   rG  )r2  rL   r   rm   r   ))pdfminerpdfminer.converterr#  pdfminer.layoutr$  r%  r&  r'  r(  r)  r*  pdfminer.pdfinterpr+  r,  r  r  rS   r   r/   r   r   	get_pagesr   ri   r!  r9   r   StringIOr   truncateseekprocess_pager   rd   r   r   rU   endswithr   r   r0   )r   rF   rI  r#  r$  r'  r(  r+  r,  r  r   r   r3  r   rG  visitor_for_allall_contentirL   r}   document_contentr%  r&  r)  r*  rA  rB  s   `                    @@@@@@r7   r   zPDFMinerParser.lazy_parse  sy    	<   R08''(83!L  4  O	<1C1E O	%%lT]]=Pb%QE(*G*'JPRS$$\DMM<OR$PQL &*[[L"&# &#+ &#P kkmG08:>O K$U+ 14  #Q,,T2"++-#>>+99&$$Q'LLO"%-!3LFA;4N!O 
  ((.#+CR=&&x0%1& yyH$#'#7#7#<#<[#I !1/= YO	 O	 O	  	2 	O	 O	 O	 O	sN   I;AI I;)I/4GI#:I/	I;I  I;#I,	(I//I84I;F)r   r   r   r   r   r   r   r_   r   r   r   r   r   zOptional[bool])r  zUnion[bytes, str]r   r_   )r   r   r   r   )r+   T)r  r	   r   r_   r  r   r   dict[str, Any]r   )r   r   r   r   r   r   r   staticmethodr	  r  r!  r   r   r   s   @r7   r   r     s    0d $  %:B #'*277;KQ,0:B:B  	:B
 (:B :B 5:B I:B *:Bx  *  < 	,, , 	,
 
,\yrJ   r   c            	           e Zd ZdZ ej
                         Z	 	 dddeddddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZ		 d	 	 	 	 	 ddZ
	 	 	 	 	 	 	 	 dd	Zdd
Z	 	 	 	 	 	 ddZddZ xZS )PyMuPDFParsera  Parse a blob from a PDF using `PyMuPDF` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'PyMuPDF' library for PDF processing and offers synchronous blob
    parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pymupdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyMuPDFParser

            parser = PyMuPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # images_parser = TesseractBlobParser(),
                # extract_tables="markdown",
                # extract_tables_settings=None,
                # text_kwargs=None,
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NrL   r3   )r   r   r   r   r   extract_tablesextract_tables_settingsc                  t         
|           |dvrt        d      |r|dvrt        d      || _        || _        || _        |xs i | _        |r|s
t               }|| _        || _	        || _
        || _        |	| _        y)a  Initialize a parser based on PyMuPDF.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from the PDF.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            extract_tables: Whether to extract tables in a specific format, such as
                "csv", "markdown", or "html".
            extract_tables_settings: Optional dictionary of settings for customizing
                table extraction.

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the mode is not "single" or "page".
            ValueError: If the extract_tables format is not "markdown", "html",
            or "csv".
        r   r   )markdownrD   csvzmode must be markdownN)r   r   rP   r   r   r   text_kwargsr   r   r   r   r\  r]  )r   ra  r   r   r   r   r   r   r\  r]  r   s             r7   r   zPyMuPDFParser.__init__a  s    V 	)):;;n4OO455	. &,"-.0M,#6 *,'>$rJ   c                $    | j                  |      S ro   )_lazy_parse)r   rF   s     r7   r   zPyMuPDFParser.lazy_parse  s    
 	
rJ   c              #    K   	 ddl }|xs | j                  }| j                  sNddlm}m}m}m} i ddddddddd	dd
|ddddd|ddddddd|d|dddddddddddd| _        t        j                  5  |j                         5 }|j                   |j                  |      }	n |j                  |d      }	|	j                  r|	j                  | j                          dddd| j#                  |	|      z  }
g }|	D ]k  }| j%                  |	||      j'                         }| j(                  dk(  r(t+        |t-        |
d|j.                  iz               [|j1                  |       m | j(                  d k(  r1t+        | j2                  j5                  |      t-        |
             ddd       ddd       y# t        $ r t        d      w xY w# 1 sw Y   *xY w# 1 sw Y   yxY ww)!a  Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.
            text_kwargs: Optional keyword arguments to pass to the `get_text` method.
                If provided at run time, it will override the default text_kwargs.

        Raises:
            ImportError: If the `pypdf` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   N)DEFAULT_JOIN_TOLERANCEDEFAULT_MIN_WORDS_HORIZONTALDEFAULT_MIN_WORDS_VERTICALDEFAULT_SNAP_TOLERANCEclipvertical_strategylineshorizontal_strategyvertical_lineshorizontal_linessnap_tolerancesnap_x_tolerancesnap_y_tolerancejoin_tolerancejoin_x_tolerancejoin_y_toleranceedge_min_length   min_words_verticalmin_words_horizontalintersection_toleranceintersection_x_toleranceintersection_y_tolerance)text_tolerancetext_x_tolerancetext_y_tolerancestrategy	add_lineszGpymupdf package not found, please install it with `pip install pymupdf`pdf)streamfiletypePyMuPDFr+   r   rL   r   r   )pymupdfra  r]  pymupdf.tablere  rf  rg  rh  r/   r[  _lockr   datar   is_encryptedauthenticater   _extract_metadata_get_page_contentrd   r   r   rU   numberr   r   r0   )r   rF   ra  r  re  rf  rg  rh  rX   r  r   full_contentrL   r}   s                 r7   rc  zPyMuPDFParser._lazy_parse  s    ,)	%9)9)9K// 0D0 (0 *7	0
 %d0 '0 %&<0 '0 '0 %&<0 '0 '0 &q0 )*D0 +,H0  -a!0" /#0$ /%0& '(()() $!%/0,>    	!!# y99$&',,y1C&',,i%HC##$$T]]3 )($&  **35	 6
  " 
6D#55c4MSSUHyyF*&)1%7 ,/D D&  %++H5
6 99("%)%9%9%>%>|%L!3L!A 5	 	  	- 	 	 	sN   G:A.G
 2G:G.D&G"9G.	G:
GG:"G+	'G..G73G:c                     |j                   di i | j                  |}| j                  ||      }| j                  |      }g }|r|j	                  |       |r|j	                  |       t        ||      }|S )a:  Get the text of the page using PyMuPDF and RapidOCR and issue a warning
        if it is empty.

        Args:
            doc: The PyMuPDF document object.
            page: The PyMuPDF page object.
            blob: The blob being parsed.

        Returns:
            str: The text content of the page.
        rp   )r8  ra  _extract_images_from_page_extract_tables_from_pager   r   )	r   r  rL   ra  rx   r   tables_from_pagerw   r}   s	            r7   r  zPyMuPDFParser._get_page_content  s    " 'M)LD,<,<)L)LM99#tD99$?MM*+MM*+)&.ArJ   c                X   t        i ddd|j                  |j                  t        |      d|j                  D ci c]5  }t	        |j                  |   t
        t        f      r||j                  |   7 c}      }dD ]#  }||j                  v s|j                  |   ||<   % |S c c}w )zExtract metadata from the document and page.

        Args:
            doc: The PyMuPDF document object.
            blob: The blob being parsed.

        Returns:
            dict: The extracted metadata.
        r  r+   )r;   r:   r=   r9   rX   r<   )modDatecreationDate)ri   r9   r   rT   rQ   r_   rS   )r   r  rF   rg   rT   s        r7   r  zPyMuPDFParser._extract_metadata!  s     # )($&"kk!%#&s8 !\\!#,,q/C:> s||A&
" - 	.ACLL !ll1o	. s   :B'
c                4   | j                   syddl}|j                         }g }|D ]=  }| j                   s|d   } |j                  ||      }t	        j
                  |j                  t        j                        j                  |j                  |j                  d      }	t        j                         }
|
j                         j                  dk(  rt        j                   |
|	       t#        j$                  |
j'                         d      }t)        | j                   j+                  |            j,                  }|j/                  t1        ||| j2                               @ t4        j7                  t8        j;                  t=        d|                  S )	a	  Extract images from a PDF page and get the text using images_to_text.

        Args:
            doc: The PyMuPDF document object.
            page: The PyMuPDF page object.

        Returns:
            str: The extracted text from the images on the page.
        r+   r   Nr   rm   application/x-npyr   r   )r   r  
get_imagesPixmapr   r   samplesr   r   r   r   r   r   r   r   numpyr   r   r   r   r   r   r   r   rI   r   r   rH   r   r0   rv   )r   r  rL   r  img_listr1   r4   xrefpiximager   rF   r   s                r7   r  z'PyMuPDFParser._extract_images_from_pageA  sK    !!??$ 	C!!1v$gnnS$/ckkBJJJJ		2 !jjl((*11Q6

;.~~((*6I "$"4"4"?"?"EFSS
'j$:R:RS#	( !''#((f)=> ( 
 	
rJ   c           
        | j                   yddl}t         |j                  j                  |fi | j
                        }|r| j                   dk(  r1t        j                  |D cg c]  }|j                          c}      S | j                   dk(  rCt        j                  |D cg c]$  }|j                         j                  ddd      & c}      S | j                   dk(  rBt        j                  |D cg c]#  }|j                         j                  dd	      % c}      S t        d
| j                    d      yc c}w c c}w c c}w )zExtract tables from a PDF page.

        Args:
            page: The PyMuPDF page object.

        Returns:
            str: The extracted tables in the specified format.
        Nr+   r   r_  rD   F)headerindex	bold_rowsr`  )r  r  zextract_tables z not implemented)r\  r  r  tablefind_tablesr]  _JOIN_TABLESr0   to_markdown	to_pandasto_htmlto_csvrP   )r   rL   r  tables_listr  s        r7   r  z'PyMuPDFParser._extract_tables_from_pagek  sd    &%GMM%%dKd.J.JK
 ""j0#((;)W%%*;*;*=)WXX$$.#(( &1 " )11#("'&+ 2 	 	 $$-#(( &1
 "	 )00#("' 1   !%d&9&9%::JK  5 *Xs   &E&)E8(Er   )ra  r   r   r   r   r   r   r   r   r_   r   r   r   r   r\  z/Union[Literal['csv', 'markdown', 'html'], None]r]  r   r   rC  r   ro   )rF   r   ra  r   r   r   )r  pymupdf.DocumentrL   pymupdf.Pagera  rX  r   r_   )r  r  rF   r   r   r   )r  r  rL   r  r   r_   )rL   r  r   r_   )r   r   r   r   	threadingLockr  r   r   r   rc  r  r  r  r  r   r   s   @r7   r[  r[  (  s+   2l INNE 15$;?
 #'*077;KQJN<@;?-;? ;?
  ;? (;? ;? 5;? I;? H;? ":;? 
;?z
 15__
 ._ 
_B  $	
 
:@(
#(
+7(
	(
T,rJ   r[  c                       e Zd ZdZ ej
                         Z	 d	ddeddd	 	 	 	 	 	 	 	 	 	 	 	 	 d
 fdZddZ	ddZ
 xZS )PyPDFium2Parserao  Parse a blob from a PDF using `PyPDFium2` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'PyPDFium2' library for PDF processing and offers synchronous
    blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdfium2

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyPDFium2Parser

            parser = PyPDFium2Parser(
                # password=None,
                mode="page",
                pages_delimiter="
",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NrL   r3   )r   r   r   r   r   c                   t         |           |dvrt        d      || _        |r|s
t	               }|| _        || _        || _        || _        || _	        y)uk  Initialize a parser based on PyPDFium2.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from the PDF.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            extraction_mode: “plain” for legacy functionality, “layout” for experimental
                layout mode functionality
            extraction_kwargs: Optional additional parameters for the extraction
                process.

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the mode is not "single" or "page".
        r   r   N)
r   r   rP   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   s          r7   r   zPyPDFium2Parser.__init__  sa    L 	)):;;,-.0M*#6  	.rJ   c              #  T  K   	 ddl }t        j                  5  |j	                         5 }d}	  |j
                  || j                  d      }g }ddddt        |j                               z  }|j                  |d	<   t        |      |d
<   t        |      D ]  \  }}|j                         }	dj                  |	j                         j                               }
|	j!                          | j#                  |      }t%        |g|
      j'                         }|j!                          | j(                  dk(  r5|j+                  d      s|dz  }t-        |t/        i |d|i             |j1                  |        | j(                  dk(  r1t-        | j2                  j                  |      t/        |             |r|j!                          	 ddd       ddd       y# t        $ r t        d      w xY w# |r|j!                          w w xY w# 1 sw Y   BxY w# 1 sw Y   yxY ww)r   r   NzKpypdfium2 package not found, please install it with `pip install pypdfium2`T)r   	autoclose	PyPDFium2r+   r   r9   r<   r-   rL   r   r   )	pypdfium2r/   r  r  r   PdfDocumentr   ri   get_metadata_dictr9   r   r   get_textpager0   get_text_range
splitlinescloser  r   rd   r   rR  r   rU   r   r   )r   rF   r  rX   r   r  r   r   rL   	text_pagerx   image_from_pager}   s                r7   r   zPyPDFium2Parser.lazy_parse  sA    	 "" 4	+!!# 3+y!
1+!6!6!6!DMMT"J $&L %0#.(*$ (
(D(D(FG	$HL
 .2[[L*25j/L/-6z-B :)T$($5$5$7	)-%446AAC* ")*.*H*H*N#9,-~$%' ! 

99.#+#4#4T#: (D 0"*-5);%&*6%&(.%&*"#  )//95:8 yyH,&)-)=)=)B)B<)P%7%E 
 ""((*g3+4	+ 4	+  	+ 	v ""((* "e3+ 3+4	+ 4	+sa   H(G  H(HHFG8<HH	H( G55H(8HHH	HH%!H(c                   | j                   syddlm} t        |j	                  |j
                  f            }|syg }|D ]   }t        j                         }|j                         j                         }|j                  dk  rFt        j                  ||j                         j                                t        j                  |j                         d      }t!        | j                   j#                  |            j$                  }	|j'                  t)        ||	| j*                               |j-                           t.        j1                  t2        j5                  |            S )	r   r+   r   N)rv   rv  r  r   r   )r   pypdfium2.rawrawr  get_objectsFPDF_PAGEOBJ_IMAGEr   r   
get_bitmapto_numpysizer  r   r   r   r   r   r   r   r   rI   r   r  r   rH   r   r0   )
r   rL   pdfium_cr1   
str_imagesr  r   r   rF   text_from_images
             r7   r  z)PyPDFium2Parser._extract_images_from_pageR  s&    !!(d&&x/J/J.L&MN
 	E**,K'')224H}}q JJ{E$4$4$6$?$?$AB>>+"6"6"8DWXD"4#5#5#@#@#FGTTO#D/4;S;ST KKM	 !''<3D3DZ3P'QQrJ   rW  )r   r   r   r   r   r   r   r_   r   r   r   r   r   rC  r   )rL   zpypdfium2._helpers.page.PdfPager   r_   )r   r   r   r   r  r  r  r   r   r   r  r   r   s   @r7   r  r    s    0h INNE  %0/ #'*077;KQ0/0/  	0/
 (0/ 0/ 50/ I0/ 
0/dM+^RrJ   r  c                  F    e Zd ZdZ	 	 	 d	 	 	 	 	 	 	 ddZd	dZd
dZd
dZy)PDFPlumberParserzParse `PDF` with `PDFPlumber`.Nc                p    	 ddl }|xs i | _        || _        || _        y# t        $ r t        d      w xY w)zInitialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
        r   NzEpillow package not found, please install it with `pip install pillow`)r   r/   ra  deduper   )r   ra  r  r   r   s        r7   r   zPDFPlumberParser.__init__v  sI    	
 ',",  	W 	s     5c              #  \  K   ddl }|j                         5 } |j                  |      }|j                  D cg c]  }t	        | j                  |      dz   | j                  |      z   t        |j                  |j                  |j                  dz
  t        |j                        dfi |j                  D ci c]6  }t        |j                  |         t        t        fv r||j                  |   8 c}       c}}E d{    ddd       yc c}w c c}}w 7 # 1 sw Y   yxY ww)Lazily parse the blob.r   Nr-   r,   )r9   rX   rL   r<   r   )
pdfplumberr   r   r   r   _process_page_contentr  r   r9   r   r   rT   r^   r_   rS   )r   rF   r  rX   r  rL   rg   s          r7   r   zPDFPlumberParser.lazy_parse  s#     	9!*//),C*  II'& % !%!;!;D!A"44T:"; "&*kk)-$($4$4q$8+.syy>	 &)\\ !#CLLO4c
B s||A.	  	 	 	 	sL   D,"D A>D7;D2D>D DD 
	D,DD  D)%D,c                    | j                   r* |j                         j                  di | j                  S  |j                  di | j                  S )z)Process the page content based on dedupe.rp   )r  dedupe_charsr   ra  )r   rL   s     r7   r  z&PDFPlumberParser._process_page_content  sJ    ;;34$$&33Gd6F6FGG t  44#3#344rJ   c                   ddl m} | j                  syg }|j                  D ]>  }|d   d   j                  t
        v r|d   d   dk(  rd|j                  t        j                  |j                  d|d   d	   |d   d
   f|d   j                               j                  d                   |j                  t        j                  |d   j                         t        j                        j                  |d   d
   |d   d	   d             |d   d   j                  t        v r$|j                  |d   j                                *t!        j"                  d       A t%        |      S )z8Extract images from page and get the text with RapidOCR.r   r   r+   r  FilterBitsPerComponentr,   1WidthHeightLr   rm   r   )r   r   r   r1   r  r   r   r   r   	frombytesr   convertr   r   r   r   warningswarnr8   )r   rL   r   r1   r4   s        r7   r  z*PDFPlumberParser._extract_images_from_page  sM   "";; 	5C8}X&++/GGx=!349MM!OO #!$Xw!7Xx9P Q #H 6 6 8 &gcl MMc(m&<&<&>bhhOWWM(3S]75KR
 Xx(--1FFc(m446734+	5. 188rJ   )NFF)ra  zOptional[Mapping[str, Any]]r  r   r   r   r   rC  r   )rL   zpdfplumber.page.Pager   r_   )r   r   r   r   r   r   r  r  rp   rJ   r7   r  r  s  sJ    ( 48$	-0- - 	-
 
-,:59rJ   r  c                  :    e Zd ZdZ	 	 ddd	 	 	 	 	 	 	 ddZddZy)	AmazonTextractPDFParsera  Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    # you can mix and match each of the features
    loader=AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"])
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    try to output the information in a tabular structure or
    output the key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.

    ``Document`` objects are returned with metadata that includes the ``source`` and
    a 1-based index of the page number in ``page``. Note that ``page`` represents
    the index of the result returned from Textract, not necessarily the as-written
    page number in the document.

    N)linearization_configc                  	 ddl }ddlmc m} || _        || _        |%|D cg c]  }|j                  |       c}| _        ng | _        ||| _        n$| j
                  j                  dddd      | _        |s	 ddl}|j                  d	      | _        y|| _        yc c}w # t        $ r t        d      w xY w# t        $ r t        d
      w xY w)a5  Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                               should be passed as an int that conforms to the enum
                               `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
            linearization_config: Config to be used for linearization of the output
                                  should be an instance of TextLinearizationConfig from
                                  the `textractor` pkg
        r   NTz# z## *)hide_figure_layouttitle_prefixsection_header_prefixlist_element_prefixzCould not import amazon-textract-caller or amazon-textract-textractor python package. Please install it with `pip install amazon-textract-caller` & `pip install amazon-textract-textractor`.textractzRCould not import boto3 python package. Please install it with `pip install boto3`.)textractcallertextractor.entities.documententitiesdocumenttc
textractorTextract_Featurestextract_featuresr  r   r/   boto3clientboto3_textract_client)r   r  r  r  r  r  fr  s           r7   r   z AmazonTextractPDFParser.__init__  s    &	'==DG(DO ,5F*01B((+*& *,&#/,@),0OO,S,S'+!%*/(+	 -T -) -2\\*-E* *0D&E*  	< 	  !B s'   !B$ B>B$ <B< B$ $B9<Cc              #    K   |j                   rt        t        |j                               nd}|ra|j                  dk(  rR|j                  rF| j
                  j                  t        |j                         | j                  | j                        }n_| j
                  j                  |j                         | j                  | j
                  j                  j                  | j                        }| j                  j                  j                  |      }t        |j                         D ]>  \  }}t        |j#                  | j$                        |j&                  |dz   d       @ yw)	zIterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers If multi-page document, blob.path
        has to be set to the S3 URI and for single page docs
        the blob.data is taken
        Ns3)input_documentfeaturesr  )r  r  	call_moder  )configr,   r9   rL   r   )pathr   r_   schemenetlocr  call_textractr  r  as_bytesTextract_Call_Mode
FORCE_SYNCr  r   r   r   r   r8  r  r9   )r   rF   url_parse_resulttextract_response_jsonr  idxrL   s          r7   r   z"AmazonTextractPDFParser.lazy_parseC  s*     8<yy8C		N3d  ''4/ ''%)WW%:%:"499~//&*&@&@ &; &" &*WW%:%:#}}//''44??&*&@&@	 &; &" ??++001GH"8>>2 	IC!]]$2K2K]L$(KKqA 	s   E+E-)NN)r  zOptional[Sequence[int]]r  zOptional[Any]r  z!Optional[TextLinearizationConfig]r   rC  r   )r   r   r   r   r   r   rp   rJ   r7   r  r    sN    0h 6: $=0
 CG=02=0 =0
 @=0 
=0~!rJ   r  c                  (    e Zd ZdZddZddZddZy)	DocumentIntelligenceParserzjLoads a PDF with Azure Document Intelligence
    (formerly Form Recognizer) and chunks at character level.c                J    t        j                  d       || _        || _        y )Na<  langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParserand langchain_community.document_loaders.pdf.DocumentIntelligenceLoader are deprecated. Please upgrade to langchain_community.document_loaders.DocumentIntelligenceLoader for any file parsing purpose using Azure Document Intelligence service.)r  r  r  model)r   r  r	  s      r7   r   z#DocumentIntelligenceParser.__init__k  s#    	
 
rJ   c              #     K   |j                   D ]]  }dj                  |j                  D cg c]  }|j                   c}      }t	        ||j
                  |j                  d      }| _ y c c}w w)N r  r   )r   r0   rk  rG   r   r9   r   )r   rF   r5   plinerG   ds          r7   _generate_docsz)DocumentIntelligenceParser._generate_docsw  sd      
	AhhAABG$"kkMMA G
	As   )A5A0
7A5c              #     K   |j                         5 }| j                  j                  | j                  |      }|j	                         }| j                  ||      }|E d{    ddd       y7 # 1 sw Y   yxY ww)r  N)r   r  begin_analyze_documentr	  r5   r  )r   rF   file_objpollerr5   docss         r7   r   z%DocumentIntelligenceParser.lazy_parse  ss       	8[[77

HMF]]_F&&tV4DOO	 	 	 	s/   A=AA1!A/"A1&	A=/A11A:6A=N)r  r   r	  r_   )rF   r   r5   r   r   r   r   )r   r   r   r   r   r  r   rp   rJ   r7   r  r  g  s    A
	rJ   r  )r1   z,Sequence[Union[Iterable[np.ndarray], bytes]]r   r_   )rF   r   rG   r_   rH   r_   r   r_   )rT   rX  r   rX  )rw   r   rx   r_   r   r_   )Cr   
__future__r   rD   r   loggingr  r  r   pathlibr   tempfiler   typingr   r   r	   r
   r   r   r   r   r   r   r   urllib.parser   r  r   langchain_core.documentsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr   3langchain_community.document_loaders.parsers.imagesr   r   r  r  r   r  )textractor.data.text_linearization_configr   r   r   r8   	getLoggerr   r   r   r   r  r   rM   rI   rU   ri   rt   r   r   r   r[  r  r  r  r  rp   rJ   r7   <module>r!     sE   . "  	      '    "   - D B
 Q9  "8> 
		8	$* ! U ,&#N 
 2je
. e
PJ^ JZ
oN odVRn VRr[9~ [9|Sn Sl& &rJ   