
    ohs:                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZmZmZmZmZ d dlmZmZ d dlmZ d dlZd dlZd dlmZ d dlZ ej4                  ej6                  d       d	Zd
ZdZ e         e j>                  d      xs dZ e e_!         e	d      Z" e	d      Z# e	d      Z$e$jK                  d        e j>                  dd      Z& e' e j>                  dd            Z( e) e j>                  dd            Z* e	d      Z+dZ,dZ- G d de      Z.dee/   dee0   fd Z1d!e0dee   fd"Z2d#e/de/fd$Z3d4d%ee/   fd&Z4d4d'e0fd(Z5d) Z6d dlmZ d dl7Z7d dl Z d dl7Z7d dlZd dlZd dlm	Z	 d d*l
mZ d d+lmZm8Z8  e' e j>                  d,d-            Z9d d.lmZm8Z8m:Z: e9fd/e	d0e'dee0   fd1Z;de0fd2Z<de0fd3Z=y)5    N)Path)ListOptionalDictAnySet)	BaseModelValidationError)Image)load_dotenvz%%(asctime)s %(levelname)s %(message)s)levelformatx      g      ?OPENAI_API_KEYYOUR_KEY_HEREzocr_output.jsonzall_mcqs.jsondebug_outputsTexist_okOPENAI_VIS_MODELzgpt-4oMAX_OUTPUT_TOKENS2000TEMPERATUREz0.0z
public/tmpzpublic/images/crops_outzpublic/debug_outputsc                       e Zd ZU eed<   eeef   ed<   ee   ed<   dZee   ed<   dZee	e      ed<   dZ
ee   ed<   dZeeeef      ed<   d	Zee   ed
<   y)MCQQuestionOptionsAnswerNExplanation
FigureRefsQuestionLatexOptionsLatexFAmbiguousMath)__name__
__module____qualname__str__annotations__r   r   r   r    r   r!   r"   r#   bool     5/var/www/html/eduruby.in/utils/extractor_functions.pyr   r   .   ss    M#s(^SM!%K#%&*Jc#*#'M8C='-1L(4S>*1$)M8D>)r+   r   resultsreturnc                    t               }| s|S | D ]]  }t        |t              s|j                  d      xs$ |j                  d      xs |j                  d      }|sOt        |t              ro|D ]i  }t        |t
              st        j                  j                  |      }|j                  d      d   j                  d      d   }|j                  |       k t        |t
              s|j                  d      D ]k  }|j                         }|st        j                  j                  |      }|j                  d      d   j                  d      d   }|j                  |       m ` |S )z
    Extract all unique figure references from the final results.
    Handles different formats of FigureRefs in the results.
    r    
figureRefsfigure_refs?r   #,)set
isinstancedictgetlistr'   ospathbasenamesplitaddstrip)r-   referenced_imagesresultr1   reffilenames         r,   &extract_figure_references_from_resultsrD   ;   sJ   
    8&$' jj.g&**\2JgfjjYfNg+t,& 8C!#s+#%77#3#3C#8#+>>##6q#9#?#?#DQ#G)--h78 K-&,,S1 8C))+C#%77#3#3C#8#+>>##6q#9#?#?#DQ#G)--h78%82 r+   textc                    	 t        j                  |       S # t         j                  $ r Y nw xY wg d}|D ]`  }t        j                  || t        j
                        }|D ]4  }	 t        j                  |      c c S # t         j                  $ r Y 2w xY w b 	 | j                  d      }| j                  d      dz   }|dk\  r||kD  rt        j                  | ||       S y# t         j                  $ r Y yw xY w)zGExtract JSON from text, handling various formats including code blocks.)z```(?:json)?\s*([\s\S]*?)\s*```z```\s*([\s\S]*?)\s*```z	`([^`]*)`{}   r   N)jsonloadsJSONDecodeErrorrefindall
IGNORECASEfindrfind)rE   json_patternspatternmatchesmatchstartends          r,   extract_json_from_textrX   `   s    zz$ M ! **WdBMM: 	Ezz%(('' 			#jjo!A:#+::d5o..   s1    --$A==BBAC" "C87C8mcq_datac                    | j                  di       }t        |t              r@i }g d}t        |      D ]%  \  }}|t	        |      k  st        |      |||   <   ' || d<   | S t        |t              rOi }g d}t        |j                               D ](  \  }\  }}|t	        |      k  st        |      |||   <   * || d<   | S )z@Normalize the Options field to be a dictionary with letter keys.r   )ABCDEFGH)r8   r6   r9   	enumeratelenr'   r7   items)rY   options
normalizedlettersioptionkeyvalues           r,   normalize_options_fieldrm      s    ll9b)G '4 
:"7+ 	5IAv3w<),V
71:&	5 ) O 
GT	"
:(9 	4OA|U3w<),U
71:&	4 )Or+   final_resultsc                 V   	 t         j                  j                  t              rPt	        j
                  t               t        j                  t        d       t        j                  dt                	 t         j                  j                  t              rAt        |       }t        j                  dt        |       dt        |              t               }g d	}|D ]  }t!        j                   t         j                  j#                  t        |            D ]R  }t         j                  j%                  |      s#t         j                  j'                  |      }|j)                  |       T  ||z
  }d
}	|D ]x  }
t         j                  j#                  t        |
      }	 t         j                  j                  |      r2t        j*                  |       t        j                  d|
        |	dz  }	z t        j,                  t              D ]  }t         j                  j#                  t        |      }t         j                  j/                  |      sG	 t        j,                  |      s.t        j0                  |       t        j                  d|        nt        j                  d|         t        j                  dt        |       d|	 d       yy# t        $ r*}t        j                  dt         d|        Y d}~d}~ww xY w# t        $ r&}t        j                  d|
 d|        Y d}~d}~ww xY w# t        $ r&}t        j                  d| d|        Y d}~^d}~ww xY w# t        $ r)}t        j                  dt         d|        Y d}~yd}~ww xY w)z
    Clean up directories while preserving images referenced in final results.
    - Always clean debug_outputs completely
    - Only clean crops_out images that are not referenced in final results
    Tr   zCleaned up debug directory: zFailed to clean up : NzFound z referenced images: )z*.jpgz*.jpegz*.pngz*.bmpz*.tiffz*.webpr   zRemoved unreferenced image: rI   zFailed to remove image zRemoved empty subdirectory: z Keeping non-empty subdirectory: zFailed to remove subdirectory z"Cleaned crops_out directory. Kept z referenced images, removed z unreferenced images.)r:   r;   existsDEBUG_OUTPUTS_DIRshutilrmtreemakedirslogginginfo	ExceptionwarningCROPS_OUT_DIRrD   rd   r9   r5   globjoinisfiler<   r>   removelistdirisdirrmdir)rn   er@   
all_imagesimage_patternsrS   img_pathrC   images_to_deletedeleted_countimg_nameitem	item_paths                r,   cleanup_directoriesr      s   H77>>+,MM+,KK)D9LL78I7JKL
1D77>>-( F} ULL6#&7"8!99MdSdNeMfgh JVN) 1 $		"'',,}g*N O 1Hww~~h/#%77#3#3H#="x011  *,== M, O77<<x@Oww~~h/		(+'CH:%NO%*O 

=1 VGGLL=	77==+V!zz)4HHY/#LL+Gv)NO#LL+KD6)RSV LL=cBS>T=UUqr  rA  AV  W  X[ )  H-.?-@1#FGGHB ! OOO&=hZr!$MNNO % V*HbQRPS(TUUV
  D-m_BqcBCCDs   A3K 6CM6 ?A$M6 $AL5A!M6 AM3'M6 	L%L

L	ML<6M6 <MM6 	M3M.(M6 .M33M6 6	N(?N##N(task_idc                    	 | r;t        j                   t        j                  j                  t        d|  d            ng }|j                  t        j                   t        j                  j                  t        d                   |j                  t        j                   t        j                  j                  t        d                   |D ]O  }	 t        j                  j                  |      r-t        j                  |       t        j                  d|        Q y# t        $ r%}t        j                  d| d|        Y d}~|d}~ww xY w# t        $ r"}t        j                  d|        Y d}~yd}~ww xY w)	z<
    Clean up temporary files created during processing
    *z*_results.jsonz*_mcqs.jsonzRemoved temp file: zFailed to remove temp file rp   NzFailed to clean temp files: )r{   r:   r;   r|   TMP_DIRextendrq   r~   rv   rw   rx   ry   )r   
temp_files	temp_filer   s       r,   cleanup_temp_filesr      s   <IPTYYrww||Gq	^DEVX
$))BGGLL:J$KLM$))BGGLL-$HIJ# 	PIP77>>),IIi(LL#6yk!BC		P
  P"=i[1# NOOP  <6qc:;;<sC   CE AD"E "	E+EE EE 	E>E99E>c                 l    g }| D ],  }t        |t              sd|v s|j                  |d          . |S )z9Flatten the page-wise results into a single list of MCQs.mcqs)r6   r7   r   )	json_datamergedpages      r,   merge_pagesr      s=    F (dD!fnMM$v,'( Mr+   )r   )r   ImageOpsMAX_IMAGE_BYTES30000)r   r   
ImageChopsr;   	max_bytesc                    	 t        j                  |       j                  d      }t        j                  |j                  |j                  d      }t        j                  ||      }|j                         }|r|j                  |      }|j                  \  }}d}	t        ||      |	kD  rI|	t        ||      z  }
|j                  t        ||
z        t        ||
z        ft         j                         }|j                  d      }t#        j$                         }|j'                  |dd	d
       t)        |j+                               }||dz  k  r|j                  d      }d}d}t-        d      D ]  }t#        j$                         }|j'                  |d|d
d       |j+                         }t)        |      |k  r+t/        j0                  |      j3                  d      }d| c S |dkD  rt        dt        |dz              }|j                  \  }}|j                  t        ||z        t        ||z        ft         j                         } y# t        $ r%}t	        j
                  d|  d|        Y d}~yd}~ww xY w)a  
    Compress image to a JPEG data URI under max_bytes.
    - Auto-crops whitespace
    - Converts to grayscale if it saves significant space
    - Aggressive resizing & quality tuning
    Returns data URI string or None if compression can't produce small enough image.
    RGBzFailed to open image rp   N)   r   r   i   LJPEG(   r   )r   qualitysubsamplingg?2   g333333?   T)r   r   r   optimizeasciizdata:image/jpeg;base64,      g?)r   openconvertrx   rv   ry   newmodesizer   
differencegetbboxcropmaxresizeintLANCZOSioBytesIOsaverd   getvaluerangebase64	b64encodedecode)r;   r   imgr   bgdiffbboxwhmax_dim_targetscaleimg_graybuf_gray	gray_sizer   scale_factor_bufrawb64s                       r,   compress_image_to_data_urir   
  s0   jj&&u- 
388SXX	7B  b)D<<>Dhhtn88DAq N
1ay>!Q*jj#a%i.#a%i.95==I {{3Hzz|HMM(621MEH%%'(I9s?"u% GL1X \jjlVW!dSllns8y ""3'..w7C,SE22 R<"c'C-01G88DAq**c!l"23S\9I5JKU]][C\  _  /vRs;<s   $I 	I<I77I<c                       y)zKBuild the system prompt for the first pass (extraction and reconstruction).a  You are an expert at extracting multiple choice questions (MCQs) from text. 
Your task is to:
1. Extract all complete MCQs from the provided text
2. Reconstruct mathematical expressions using LaTeX when possible
3. Associate figures with questions when appropriate
4. Skip questions that are clearly incomplete or continue on another page

Return a JSON array of objects with the following structure for each MCQ:
{
  "Question": "The question text",
  "Options": {"A": "Option A", "B": "Option B", ...},
  "Answer": "Correct answer letter (A, B, C, etc.) if available",
  "Explanation": "Explanation if available",
  "FigureRefs": ["List of figure names if applicable"],
  "QuestionLatex": "LaTeX version of the question if it contains math",
  "OptionsLatex": {"A": "LaTeX for option A", ...} if options contain math,
  "AmbiguousMath": true/false if mathematical notation is ambiguous
}

If no MCQs are found, return an empty array [].
r*   r*   r+   r,   build_system_prompt_reconstructr   K  s    r+   c                       y)zDBuild the system prompt for the second pass (validation and repair).a  You are an expert at validating and repairing multiple choice questions (MCQs). 
Your task is to:
1. Validate the logical consistency of each MCQ
2. Repair any issues with formatting, clarity, or completeness
3. Return a valid JSON object for each MCQ (even if you make changes)
4. Only skip questions that are fundamentally flawed or incomplete

Return a JSON object with the same structure as the input, or if the question should be skipped:
{"skip": true, "reason": "Explanation for skipping"}

If the question is valid, return it in the proper format:
{
  "Question": "The question text",
  "Options": {"A": "Option A", "B": "Option B", ...},
  "Answer": "Correct answer letter (A, B, C, etc.) if available",
  "Explanation": "Explanation if available",
  "FigureRefs": ["List of figure names if applicable"],
  "QuestionLatex": "LaTeX version of the question if it contains math",
  "OptionsLatex": {"A": "LaTeX for option A", ...} if options contain math,
  "AmbiguousMath": true/false if mathematical notation is ambiguous
}
r*   r*   r+   r,   build_system_prompt_validater   c  s    r+   )N)>r:   rM   rJ   timer   rv   concurrent.futures
concurrentpathlibr   typingr   r   r   r   r   pydanticr	   r
   PILr   rs   r{   dotenvr   openaibasicConfigINFOPER_PAGE_TIMEOUTMAX_RETRIESRETRY_DELAYgetenvr   api_keyINPUT_PAGES_JSONOUTPUT_MCQ_JSON	DEBUG_DIRmkdir
MODEL_NAMEr   r   floatr   r   rz   rr   r   r7   r'   rD   rX   rm   r   r   r   r   r   r   r   r   r   r   r*   r+   r,   <module>r      s   	 	       1 1 /        ',,/V W  +,? )* '!	 	 RYY)84
			"5v>? IBIImU34
|
 ** 	*) 	*#DJ #3s8 #J! !# !Hd t 0ADtDz ADF< <*  	     ibii 17;< + +<K 9T 9c 9PXY\P] 9B 0c r+   