
    hM<                         d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ erddlmZ ddlmZ  ej8                  e      Ze G d d             Z eddd       G d de             Z y)zModule contains a PDF parser based on Document AI from Google Cloud.

You need to install two libraries to use this parser:
pip install google-cloud-documentai
pip install google-cloud-documentai-toolbox
    N)	dataclass)TYPE_CHECKINGIteratorListOptionalSequence)
deprecated)Document)batch_iterate)BaseBlobParser)Blob)get_client_info)	OperationDocumentProcessorServiceClientc                   &    e Zd ZU dZeed<   eed<   y)DocAIParsingResultsz/Dataclass to store Document AI parsing results.source_pathparsed_pathN)__name__
__module____qualname____doc__str__annotations__     p/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_community/document_loaders/parsers/docai.pyr   r      s    9r   r   z0.0.32z1.0z&langchain_google_community.DocAIParser)sinceremovalalternative_importc                      e Zd ZdZdddddded   dee   dee   dee   fd	Zd
edee	   fdZ
	 	 	 d"d
ededee   deee      dee	   f
dZ	 	 	 d#dee   dee   dededee	   f
dZdee   dee	   fdZdee   ded   fdZded   defdZdddddddee   dee   dee   dededee   ded   fd Zded   dee   fd!Zy)$DocAIParserz`Google Cloud Document AI` parser.

    For a detailed explanation of Document AI, refer to the product documentation.
    https://cloud.google.com/document-ai/docs/overview
    N)clientlocationgcs_output_pathprocessor_namer$   r   r%   r&   r'   c                h   t        |      t        |      k(  rt        d      d}|r%t        j                  ||      st        d| d      || _        || _        |r|| _        y		 ddlm} ddl	m
}  || d
      }	 ||	t        d            | _        y	# t        $ r}t        d      |d	}~ww xY w)a  Initializes the parser.

        Args:
            client: a DocumentProcessorServiceClient to use
            location: a Google Cloud location where a Document AI processor is located
            gcs_output_path: a path on Google Cloud Storage to store parsing results
            processor_name: full resource name of a Document AI processor or processor
                version

        You should provide either a client or location (and then a client
            would be instantiated).
        zGYou must specify either a client or a location to instantiate a client.z?projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+zProcessor name z has the wrong format. If your prediction endpoint looks like https://us-documentai.googleapis.com/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process, use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID part.r   )ClientOptionsr   Zdocumentai package not found, please install it with `pip install google-cloud-documentai`Nz-documentai.googleapis.com)api_endpointzdocument-ai)module)client_optionsclient_info)bool
ValueErrorre	fullmatch_gcs_output_path_processor_name_clientgoogle.api_core.client_optionsr)   google.cloud.documentair   ImportErrorr   )
selfr$   r%   r&   r'   patternr)   r   excoptionss
             r   __init__zDocAIParser.__init__2   s    * <4>) 
 U",,w"G!.!1 2   !0-!DLHR $ (z)CDG :&+=ADL  != s   %B 	B1 B,,B1blobreturnc              #   \   K   | j                  |g| j                        E d{    y7 w)zParses a blob lazily.

        Args:
            blobs: a Blob to parse

        This is a long-running operation. A recommended way is to batch
            documents together and use the `batch_parse()` method.
        r&   N)batch_parser3   )r9   r>   s     r   
lazy_parsezDocAIParser.lazy_parsel   s)      ##TFD<Q<Q#RRRs   ",*,Tenable_native_pdf_parsing
field_mask
page_rangec           
   #     K   	 ddl m} ddlm}m}m} 	 ddlm |r	 ||      nd}
|r	 ||	      nd}| j                  j                  |j                  | j                  |j                  j                  j                  xs d
       ||
|      d|            fdj                   j"                  D        E d{    y# t        $ r}	t        d      |	d}	~	ww xY w# t        $ r}	t        d      |	d}	~	ww xY w7 ?w)a  Parses a blob lazily using online processing.

        Args:
            blob: a blob to parse.
            enable_native_pdf_parsing: enable pdf embedded text extraction
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"
            page_range: list of page numbers to parse. If `None`,
                entire document will be parsed.
        r   
documentai)IndividualPageSelector	OcrConfigProcessOptionsr*   N_text_from_layoutjdocumentai_toolbox package not found, please install it with `pip install google-cloud-documentai-toolbox`rD   )pagesapplication/pdfgcs_uri	mime_type)
ocr_configindividual_page_selectorT)namegcs_documentprocess_optionsskip_human_reviewrE   c              3      K   | ]M  }t         |j                  j                  j                        |j                  j
                  d        O yw)pagesource)page_contentmetadataN)r
   layoutdocumenttextpage_numberpath).0r^   rN   r>   responses     r   	<genexpr>z-DocAIParser.online_process.<locals>.<genexpr>   sT      	
  .t{{H<M<M<R<RS ,,"ii 	
s   AA)google.cloudrI    google.cloud.documentai_v1.typesrJ   rK   rL   r8   -google.cloud.documentai_toolbox.wrappers.pagerN   r5   process_documentProcessRequestr4   GcsDocumentrf   mimetyperc   rQ   )r9   r>   rD   rE   rF   rI   rJ   rK   rL   r;   rV   rW   rN   rh   s    `          @@r   online_processzDocAIParser.online_processw   s<    $	/ 	W ) 0IJ 	 9C"4 	! <<00%%))'33 II"mm@/@ 4  !/)-E! #'% & 
	
 !))//	
 		
 		
I  	9 	  	A 	:		
sQ   DC C) B(DDD	C&C!!C&&D)	D2C>>DDblobstimeout_seccheck_in_interval_secc              #     K   |xs | j                   }|st        d      | j                  ||      }|D cg c]  }|j                  j                   }}t
        j                  d|       d}	| j                  |      rUt        j                  |       |	|z  }	|	|kD  rt        d| d      t
        j                  d       | j                  |      rU| j                  |      }
| j                  |
      E d	{    y	c c}w 7 
w)
a  Parses a list of blobs lazily.

        Args:
            blobs: a list of blobs to parse.
            gcs_output_path: a path on Google Cloud Storage to store parsing results.
            timeout_sec: a timeout to wait for Document AI to complete, in seconds.
            check_in_interval_sec: an interval to wait until next check
                whether parsing operations have been completed, in seconds
        This is a long-running operation. A recommended way is to decouple
            parsing from creating LangChain Documents:
            >>> operations = parser.docai_parse(blobs, gcs_path)
            >>> parser.is_running(operations)
            You can get operations names and save them:
            >>> names = [op.operation.name for op in operations]
            And when all operations are finished, you can use their results:
            >>> operations = parser.operations_from_names(operation_names)
            >>> results = parser.get_results(operations)
            >>> docs = parser.parse_from_results(results)
        :An output path on Google Cloud Storage should be provided.rA   z9Started parsing with Document AI, submitted operations %sr   z#Timeout exceeded! Check operations z later!.)
operationsN)r3   r0   docai_parse	operationrX   loggerdebug
is_runningtimesleepTimeoutErrorget_resultsparse_from_results)r9   rr   r&   rs   rt   output_pathrx   opoperation_namestime_elapsedresultss              r   rB   zDocAIParser.batch_parse   s    4 &>)>)>L  %%e[%I
7AB2<<,,BBG	
 ooj)JJ,-11Lk)"9/9J'R  LL ooj) ""j"9**7333 C 	4s#   5DD A?D&D:D;Dr   c              #      	K   	 ddl m} ddlm} ddlm |D ]4  	 |	j                        \  }} |||      }	fd|D        E d {    6 y # t        $ r}t        d      |d }~ww xY w7 $w)Nr   )split_gcs_uri)_get_shardsrM   rO   c              3      K   | ]T  }|j                   D ]C  }t         |j                  |j                        |j                  j
                  d        E V ywr]   )rQ   r
   rb   rd   re   r   )rg   shardr^   rN   results      r   ri   z1DocAIParser.parse_from_results.<locals>.<genexpr>  sf      
 !KK  !24;;

!K&*&6&6&BTBTU s   AA)7google.cloud.documentai_toolbox.utilities.gcs_utilitiesr   1google.cloud.documentai_toolbox.wrappers.documentr   rl   rN   r8   r   )
r9   r   r   r   r;   gcs_bucket_name
gcs_prefixshardsrN   r   s
           @@r   r   zDocAIParser.parse_from_results   s     
	 VW  
	F*78J8J*K'OZ *=F
 $  
	  	A 	s3   A1A 2A1
A/A1	A,A''A,,A1r   r   c                     	 ddl m} |D cg c]%  }| j                  j	                   ||            ' c}S # t        $ r}t        d      |d}~ww xY wc c}w )z5Initializes Long-Running Operations from their names.r   )GetOperationRequestzhlong running operations package not found, please install it with `pip install gapic-google-longrunning`N)rX   )request)!google.longrunning.operations_pb2r   r8   r5   get_operation)r9   r   r   r;   rX   s        r   operations_from_namesz!DocAIParser.operations_from_names
  sn    	 (
 LL&&/B/M&N
 	
  	: 	
s   9 *A	AAArx   c                 &    t        d |D              S )Nc              3   >   K   | ]  }|j                            y w)N)done)rg   r   s     r   ri   z)DocAIParser.is_running.<locals>.<genexpr>  s     6Rrwwy=6s   )any)r9   rx   s     r   r}   zDocAIParser.is_running  s    6:666r   i  )r&   r'   
batch_sizerD   rE   r   c                   	 ddl m} ddlm}m}	 |xs | j                  }|t        d      |xs | j                  }|t        d      g }t        ||      D ]  }|j                  |j                  |D cg c]-  }|j                  |j                  |j                  xs d	
      / c}            }|j                  |j                  j!                  ||            }|r |	 ||            nd}|j#                  | j$                  j'                  |j)                  ||||d                    |S # t
        $ r}
t        d      |
d}
~
ww xY wc c}w )a3  Runs Google Document AI PDF Batch Processing on a list of blobs.

        Args:
            blobs: a list of blobs to be parsed
            gcs_output_path: a path (folder) on GCS to store results
            processor_name: name of a Document AI processor.
            batch_size: amount of documents per batch
            enable_native_pdf_parsing: a config option for the parser
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"

        Document AI has a 1000 file limit per batch, so batches larger than that need
        to be split into multiple requests.
        Batch processing is an async long-running operation
        and results are stored in a output GCS bucket.
        r   rH   )rK   rL   r*   Nrv   z0A Document AI processor name should be provided.)sizeiterablerR   rS   )	documents)gcs_documents)rT   rE   )gcs_output_configrP   )rV   T)rX   input_documentsdocument_output_configrZ   r[   )rj   rI   rk   rK   rL   r8   r3   r0   r4   r   BatchDocumentsInputConfigGcsDocumentsro   rf   rp   DocumentOutputConfigGcsOutputConfigappendr5   batch_process_documentsBatchProcessRequest)r9   rr   r&   r'   r   rD   rE   rI   rK   rL   r;   r   rx   batchr>   input_configoutput_configrZ   s                     r   ry   zDocAIParser.docai_parse  s   6	/R &>)>)>L  (?4+?+?!OPP
"
UC &	E%??(55 %*
 !	 #..$(II&*mm&H7H /  6  @ 
L ';;","A"A"Q"Q'J #R # < M - (2K    4422+(4/<(7*. 3 
9&	N o  	9 	&s   D8 62E8	EEEc           	      x   	 ddl m} |D cg c]  }t        |j                  |      r|j                  j
                  n.|j                  |j                  j                        j
                  D ]#  }t        |j                  |j                        %  c}}S # t        $ r}t        d      |d }~ww xY wc c}}w )Nr   )BatchProcessMetadatar*   )r   r   )google.cloud.documentai_v1r   r8   
isinstancera   individual_process_statusesdeserializevaluer   input_gcs_sourceoutput_gcs_destination)r9   rx   r   r;   r   statuss         r   r   zDocAIParser.get_resultsu  s    	G !

  bkk+?@ 77)55KK%%--
   "33"99

 	
  	9 	
s   B BB6	B3"B..B3)TNN)Ni  <   )r   r   r   r   r   r   r=   r   r   r
   rC   r/   r   intrq   r   rB   r   r   r   r}   ry   r   r   r   r   r#   r#   &   s    >B"&)-(,8 9:8 3-	8
 "#8 !8t	St 	S(: 	S +/$(*.F
F
 $(F
 SM	F

 T#Y'F
 
(	F
V *.%'/4~/4 "#/4 	/4
  #/4 
(	/4b/0	(	4
T#Y 
4CT 
"7T+%6 74 7 *.(,*.$(U~U "#	U
 !U U $(U SMU 
k	Un
d;&7 
DAT<U 
r   r#   )!r   loggingr1   r~   dataclassesr   typingr   r   r   r   r   langchain_core._api.deprecationr	   langchain_core.documentsr
   langchain_core.utils.iterr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr   &langchain_community.utilities.vertexair   google.api_core.operationr   r7   r   	getLoggerr   r{   r   r#   r   r   r   <module>r      s     	  ! D D 6 - 3 D B B3F 
		8	$    
?
`
. `

`
r   