
    hn                         d dl Z d dlmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ  e j                  e      Z G d de      Z G d d	e
      Zy)
    N)DictIteratorListUnion)Document)BaseBlobParser)Blobc                       e Zd ZdZy)ServerUnavailableExceptionz7Exception raised when the Grobid server is unavailable.N)__name__
__module____qualname____doc__     q/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain_community/document_loaders/parsers/grobid.pyr   r      s    Ar   r   c            	       \    e Zd ZdZ	 ddededdfdZdedededee   fd	Z	d
e
dee   fdZy)GrobidParserz)Load  article `PDF` files using `Grobid`.segment_sentencesgrobid_serverreturnNc                     || _         || _        	 t        j                  |       y # t        j                  j
                  $ r t        j                  d       t        w xY w)NzyGROBID server does not appear up and running,                 please ensure Grobid is installed and the server is running)	r   r   requestsget
exceptionsRequestExceptionloggererrorr   )selfr   r   s      r   __init__zGrobidParser.__init__   sZ    
 "3*	-LL'""33 	-LLM -,	-s	   & 9A	file_pathxml_datac              #     K   	 ddl m}  ||d      }|j                  d      }|j                  d      }|r|d   j                  }nd}g }	|D ]  }
|
j                  d      }|t        |
j                  d
            D ]  \  }}g }g }t        |j                  d            D ]  \  }}|j                  |j                         g }|j                  d      m|j                  d      j                  d      D ]9  }|j                  d      }|j                  |d   |d   |d   |d   |d   d       ; |j                  |       |du st        |      dkD  s|d   d   |d   d   }}|j                  t        |      |g|j                  |j                  d      ||fd}|	j                  |        |dus=|d   d   d   |d   d   d   }}dj                  |      t        |      ||j                  |j                  d      ||fd}|	j                  |         |	D cg c]~  }t        |d   t        t        |d         t        |d         t        |d         t        |d         t        |d         t        |d         t        |      t        |      d       !       c}E d	{    y	# t        $ r t        d      w xY wc c}w 7 "w)"z!Process the XML file from Grobin.r   )BeautifulSoupzA`bs4` package not found, please install it with `pip install bs4`xmldivtitlezNo title foundheadNpscoords;,            )pagexyhwTr2   n)textparabboxessection_titlesection_numberpages r9   r:   r;   r>   r<   r=   )r9   r:   r;   r>   r<   r=   paper_titler!   )page_contentmetadata)bs4r$   ImportErrorfind_allr9   find	enumerateappendr   splitlenstrjoinr   dict)r   r!   r"   r   r$   soupsectionstitlesr'   chunkssectionsecti	paragraphchunk_bboxesparagraph_textsentencesbboxesbboxboxfpagelpagesentence_dictparagraph_dictchunks                            r   process_xmlzGrobidParser.process_xml&   s    
	)
 Xu-=='w'1INNE$E .	6G<<'D$-g.>.>s.C$D +6LAy#%L%'N'01C1CC1H'I 98&--hmm<"$#<<1=(0X(>(D(DS(I 
"&*jjo '03A-0V-0V-0V-0V%&!"
" )//8-5CL1<L+21:f+=wr{6?R5E(0(+A+2)1526((3-*/-M #MM-8596 )4(OA.v6(,R08  %
 %'GGN$;$'F&2-1YY.2hhsm&+U^* n5W+6.	6@  !
   "6] #E&M 2 #E&M 2"%eHo"6!$U7^!4),U?-C)D*-e4D.E*F'*5z%(^	
 	
 	
y  	S 	x
 	
sP   KJ8 AK%C%KKA K<A-K)BK,K2K3K8K	Kblobc           	         |j                   }|t        d      t        |d      }d||dddifi}	 i }dD ]  }d||<   	 d	d
g|d<   |xs i }t        j                  d| j
                  d d ||d      }|j                  }|t        g       S | j                  ||| j                        S # t        j                  j                  $ r t        j                  d       d }Y `w xY w)Nzblob.source cannot be None.rbinputzapplication/pdfExpires0)generateIDsconsolidateHeadersegmentSentences1r(   r*   teiCoordinatesPOST<   )headersparamsfilesdatatimeoutz%GROBID server timed out. Return None.)source
ValueErroropenr   requestr   r9   r   ReadTimeoutr   r   iterra   r   )	r   rb   r!   pdfrq   rr   paramrr"   s	            r   
lazy_parsezGrobidParser.lazy_parse|   s   KK	:;;9d#9c+<y#>NOP	57DQ "!U"&,c]D!"KRE  ""A vvH
 8O##Ix9O9OPP "".. 	LL@AH	s   AB( (4CC)z1http://localhost:8070/api/processFulltextDocument)r   r   r   r   boolrK   r    r   r   ra   r	   r}   r   r   r   r   r      sw    3
 Q-- - 
	- T
T
(+T
@DT
	(	T
lQt Q(: Qr   r   )loggingtypingr   r   r   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr	   	getLoggerr   r   	Exceptionr   r   r   r   r   <module>r      sH     . .  - D B			8	$	 	FQ> FQr   