
    Oh(8              
       r   d Z ddlZddlZddlZddlZddlZddlZddlZddl	m
Z
 ddlmZmZmZmZ ddlmZmZ ddlmZ ddl ddlmZ ddlZ ej2                  ej4                  d	
       dZdZdZ e         ej<                  d      xs dZee_          e
d      Z!e!jE                  d        ej<                  dd      Z# e$ ej<                  dd            Z% e& ej<                  dd            Z' G d de      Z(de$de)dee)   deee*      fdZ+d  Z,d!e*dee*   fd"Z-d#e*de)de.fd$Z/d%e*dee*   fd&Z0y)'a-  
Modified pipeline implementing the user's plan:
1) First pass: extract MCQs and *try to reconstruct mathematical expressions / figure refs* (single prompt function)
2) Second pass: validate/repair each MCQ for logical validity and optionally rephrase while keeping key points
3) Skip any questions that appear to continue on another page (heuristics applied)

Outputs structured, Pydantic-validated JSON using an extended MCQ model.

Notes:
- Keep your OPENAI_API_KEY in .env or env vars as before.
- Tweak MODEL_NAME, timeouts and retry values to taste.

    N)Path)ListOptionalDictAny)	BaseModelValidationError)Image)*)load_dotenvz%%(asctime)s %(levelname)s %(message)s)levelformatx      g      ?OPENAI_API_KEYYOUR_KEY_HEREzpublic/debug_outputsT)exist_okOPENAI_VIS_MODELzgpt-4o-miniMAX_OUTPUT_TOKENS2000TEMPERATUREz0.0c                       e Zd ZU eed<   eeef   ed<   dZee   ed<   dZee   ed<   dZ	ee
e      ed<   dZee   ed<   dZeeeef      ed<   dZee   ed	<   y)
MCQQuestionOptionsNAnswerExplanation
FigureRefsQuestionLatexOptionsLatexAmbiguousMath)__name__
__module____qualname__str__annotations__r   r   r   r   r   r   r   r    r!   bool     1/var/www/html/eduruby.in/utils/openai_pipeline.pyr   r   3   sw    M#s(^ FHSM !%K#%&*Jc#*#'M8C='-1L(4S>*1$(M8D>(r)   r   page_num	page_textfigure_pathsreturnc           
      (   t               }|rdj                  |      }nd}dd|  d| d| dg}|D ]y  }t        |      }|j                         rA	 t	        |      }|r|j                  dd	|id
       nt        j                  d|  d|        _t        j                  d|  d|        { d}
|
t        k  rQ	 d|dd|dg}t        j                  j                  j                  t        |t        t               }t)        |j*                  d   j,                  dd      }|r;t/        d|  d|
 d|dd  d       	 t0        d|  d|
 dz  j3                  |d !       n>t        j4                  d|  d"|
 d#       |
dz  }
t#        j$                  t&        |
z         t7        |      }|Kt9        |t:              r|gS t9        |t<              r|S t        j                  d|  d$t?        |              g S t        j4                  d|  d%       	 d&| d'}d|dd|dg}t        j                  j                  j                  t        |t        d(      }t)        |j*                  d   j,                  dd      }|rLt7        |      }|%t9        |t:              r|gS t9        |t<              r5|S t        j4                  d|  d*       nt        j4                  d|  d+       |
dz  }
t#        j$                  t&        |
z         |
t        k  rQt        j@                  d|  d,       g S # t        $ r)}	t        j                  d|  d| d|	        Y d}	~	!d}	~	ww xY w# t        $ rJ}	t        j                  d|  d|
 d|	        |
dz  }
t#        j$                  t&        |
z         Y d}	~	d}	~	ww xY w# t        $ r Y  w xY w# t        $ rG}	t        j                  d|  d)|	        |
dz  }
t#        j$                  t&        |
z         Y d}	~	\d}	~	ww xY w)-a  
    First pass: ask the model to extract MCQs and attempt to reconstruct math/figure refs.
    Skips obviously incomplete/continued questions (prompt tells model to skip).
    Properly sends images in multimodal format, falling back to listing paths if unavailable.
    z, NonetextzPAGE z TEXT:

z

IMAGE PATHS: )typer1   	image_urlurl)r2   r3   [Page z(] Could not compress image under limit: z] Image compression failed for : Nz] Missing image file: r   systemrolecontentusermodelmessages
max_tokenstemperaturez] OpenAI API error on attempt    r:   zRaw response for page z on attempt    ...page_extract_attemptz_raw.txtzutf-8)encodingz ] Empty model output on attempt z. Retrying...z] Unexpected JSON type: z,] Failed to parse JSON; running repair pass.a'  The previous reply was not valid JSON. Re-format it into a valid JSON array of objects with keys: Question, Options, optional Answer, optional Explanation, optional FigureRefs, optional QuestionLatex, optional OptionsLatex, optional AmbiguousMath. Return ONLY the JSON array.

PREVIOUS_OUTPUT:

z

        z] Repair API error: z2] Repair attempt failed to produce parseable JSON.z ] Repair attempt returned empty.z(] All attempts exhausted; skipping page.)!build_system_prompt_reconstructjoinr   existscompress_image_to_data_uriappendloggingwarning	ExceptionMAX_RETRIESopenaichatcompletionscreate
MODEL_NAMEr   r   timesleepRETRY_DELAYgetattrchoicesmessageprint	DEBUG_DIR
write_textinfoextract_json_from_text
isinstancedictlistr2   error)r+   r,   r-   system_promptimg_list_str
user_partsfig_path_strfig_pathdata_urieattemptr>   respraw_textparsed_jsonrepair_promptrepair_messagesrepair_resprepair_textrepaireds                       r*   extract_and_reconstructru   A   sf    45M yy. 5
*YKGXYeXf!ghJ
 % Q%??c5h?%%{%QYIZ&[\OOfXJ6^_g^h$ij OOfXJ.DXJOPQ G
K
	!m<J7H ;;**11 !,'	 2 D 4<<?22ItD*8*L	HUYVYNK[[^_`tH:-=gYhOO[[\dov[w LL6(+KG9TabcqLGJJ{W,-,X6"+t,#}$K.""&
2J4P[K\J] ^_	 	vhZ'STU	' (0j	6  "m<M:O !++1188 (,	 9 K "+"5"5a"8"@"@)TRK -k:H#h-$:%$/#OvhZ/abcLL6(+KLM1

;()e K
h MMF8*$LMNIw  c&
2QRZQ[[]^_]` abbc$  	OOfXJ.LWIUWXYWZ[\qLGJJ{W,-		  P  	OOfXJ.B1#FGqLGJJ{W,-		s\   ?L&;AM =!N1 A)O &	M/MM	N.$?N))N.1	N>=N>	P
<PPc                     	 | j                   d   j                  j                  S # t        $ r" 	 | d   d   d   d   cY S # t        $ r Y Y yw xY ww xY w)z<Robustly extract assistant text content across SDK variants.r   rZ   r[   r:   N)rZ   r[   r:   rO   )rm   s    r*   _get_message_contentrw      s`    ||A&&... 		?1%i0;; 		s)   "% 	A?A	AAAAmcq_itemc                    t               }t        j                  | d      }d}|t        k  rF	 d|dd|dg}t        j
                  j                  j                  t        |dd	      }t        |      }|s7t        j                   d       |dz  }t        j                  t        |z         t#        |      }|7t        j                   d       |dz  }t        j                  t        |z         t%        |t&              r&t)        |      dk(  rt%        |d   t*              r|d   }t%        |t*              r:|j-                  d      r)t        j                   d|j-                  dd              y|S t        j                   d       y# t        $ rG}t        j                  d
| d|        |dz  }t        j                  t        |z         Y d}~d}~ww xY w)z
    Second pass: check logical validity and (if valid) return a cleaned object.
    If invalid or incomplete, return None.
    F)ensure_asciir   r7   r8   r;   i  rG   r<   z Validation API error on attempt r6   rA   Nz&Validation returned empty. Retrying...z)Validation JSON parse failed. Retrying...skipzItem skipped by validator: reasonz	no reasonz-Validation attempts exhausted; skipping item.)build_system_prompt_validatejsondumpsrP   rQ   rR   rS   rT   rU   rO   rM   rN   rV   rW   rX   rw   r_   r`   ra   rc   lenrb   get)	rx   re   user_contentrl   r>   rm   rk   rn   parseds	            r*   validate_singler      s   
 12M::hU;LG
K
	!m<L9H ;;**11 !	 2 D (-LLABqLGJJ{W,- (1>LLDEqLGJJ{W,- fd#Fq(8Zq	SW=XAYF fd#

6(:LL6vzz(K7X6YZ[ LL@AG  	OO>wir!MNqLGJJ{W,-		s   ;F
 
	G<GGitemc                 :   | j                  d      }|rt        |t        t        f      syt        |t              rt	        |      dk  ry| j                  dd      dz   dj                  t        |t              r|j                         n|      z   j                         }d|v sd|v sd	|v ryd
|v s#d|v r |j                         j                  d      ryt        |t              r8|j                         D ]%  \  }}|rt	        |j                               dk  s% y y)Nr   Tr   r     zcontinued on next page	continuedcontdrC   u   —F)r   ra   rb   rc   r   rI   valueslowerstripendswithitems)r   r,   optscombinedkvs         r*   is_incomplete_mcq_candidater     s    88IDz$t5$#d)a-R(3.JW[]aLb4;;=hl1nnuuwH8+{h/F'U]J]EX-(..2B2K2KE2R$JJL 	DAqAGGI*	 r)   page_objc           
      x   | j                  d      }| j                  dd      xs d}| j                  dg       xs g }t        |||      }|sg S g }t        |d      D ]  \  }}t        |t              st        j                  d| d|        2t        ||      rt        j                  d| d	| d
       [t        |      }|st        j                  d| d| d       t        |      }d|v r2t        |d   t              r|d   D 	cg c]  }	t        |	       c}	|d<   	 t        di |}
	 |j                  |
j                                 |S c c}	w # t         $ r" |j                  |
j                                Y 3w xY w# t"        $ r)}t        j                  d| d| d|        Y d }~Vd }~ww xY w)NrD   r1   r   figuresrA   )startr5   z] skipping non-object item #z] skipping item #u6    — detected as incomplete/continued on another page.z] item #z. was skipped by validator or failed to repair.r   z'] Pydantic validation failed for item #r6   r(   )r   ru   	enumeratera   rb   rM   rN   r   r_   r   normalize_options_fieldrc   r%   r   rL   
model_dumprO   r	   )r   r+   r,   r-   	raw_items	validatedir   rt   xmcqves               r*   process_page_objr     s   ||F#HVR(.BI<<	2.4"L')\JI	IYa0 4$%OOfXJ.J1#NO 'tY7LL6(+<QC?uvw #4(LL6(8A36def +848#
8L3I4(P6>|6L%Mc!f%MH\"	//C3  !121>  &N  3  !123 	OOfXJ.UVWUXXZ[]Z^_`	s<   E%F1E(FFFF	F9F44F9)1__doc__osrer~   rV   base64rM   concurrent.futures
concurrentpathlibr   typingr   r   r   r   pydanticr   r	   PILr
   utils.extractor_functionsdotenvr   rQ   basicConfigINFOPER_PAGE_TIMEOUTrP   rX   getenvr   api_keyr]   mkdirrU   intr   floatr   r   r%   rb   ru   rw   r   r'   r   r   r(   r)   r*   <module>r      sv   
 	       , , /  '     ',,/V W  +,? '(	 	 RYY)=9
			"5v>? IBIImU34)) )wc wc wc wW_`dei`jWk wt8d 8x~ 8xd s t .)t )T
 )r)   