
    hF                         d dl mZmZmZ ddlmZ erddlmZ ddlm	Z	m
Z
mZmZmZ ddlmZ  e       rd dlZ ej"                  e      ZdZ G d	 d
e      Zy)    )TYPE_CHECKINGAnyOptional   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_kernels_availableis_torch_availableis_triton_availablelogging)get_module_from_nameNc                   B    e Zd ZdZdZdZdgZ fdZd Zd Z	d#d	Z
d
ddddedeeef   fdZ	 d$d
ddddedddeeef   deee      fdZd%dZd
ddee   dee   fdZ	 d$d
ddeee      fdZdee   dedee   fdZd ZdedefdZd  Zd$d!Zedefd"       Z xZS )&Mxfp4HfQuantizerz/
    FP4 quantization using fbgemm kernels
    TF
acceleratec                 B    t        |   |fi | || _        d | _        y N)super__init__quantization_configtriton_kernels_hub)selfr   kwargs	__class__s      e/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/quantizers/quantizer_mxfp4.pyr   zMxfp4HfQuantizer.__init__1   s&    ,77#6 "&    c                     | j                    	 ddlm}  |d      | _         | j                   S | j                   S # t        $ r t        d      w xY w)z3Lazy import and initialize kernels only when neededr   )
get_kernelz kernels-community/triton_kernelsz2kernels package is required for MXFP4 quantization)r   kernelsr   ImportError)r   r   s     r   _lazy_import_kernelsz%Mxfp4HfQuantizer._lazy_import_kernels6   s]    ""*X.*45W*X' &&&t&&&  X!"VWWXs	   9 Ac                    t               st        d      | j                  j                  ry t        j
                  j                         s>| j                  r't        j                  d       d| j                  _        y t        d      t               st        d      t        j
                  j                         }|dk\  }t        d      xr
 t               }| j                  rR|s't        j                  d       d| j                  _        y |sAt        j                  d	       d| j                  _        y |st        d
      |st        d      | j                  s| j!                          |j#                  d      }|t        j                  d       y |N| j                  sAt%        |t&              r0d|j)                         v sd|j)                         v rt        d      y y y y )NzqUsing mxfp4 quantization requires torchPlease install the latest version of torch ( pip install --upgrade torch )z^Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16Tz-Quantizing a model using MXFP4 requires a GPUz9Using mxfp4 requires Accelerate: `pip install accelerate`)      z3.4.0zMXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200). We will default to dequantizing the model to bf16.ztMXFP4 quantization requires triton >= 3.4.0 and kernels installed, we will default to dequantizing the model to bf16zmMXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200)zAMXFP4 quantization requires triton >= 3.4.0 and kernels installed
device_mapzYou have loaded an FP4 model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. cpudiskzYou are attempting to load an FP4 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r   r!   r   
dequantizetorchcudais_availablepre_quantizedloggerwarning_onceRuntimeErrorr
   get_device_capabilityr   r   
ValueErrorr"   get
isinstancedictvalues)r   argsr   compute_capabilitygpu_is_supportedkernels_availabler&   s          r   validate_environmentz%Mxfp4HfQuantizer.validate_environmentA   s   !#] 
 ##..zz&&(!!##t 7;((3"#RSS&(YZZ"ZZ==?-7/8S=Q=S###I 7;((3$## K 7;((3!  #`aa!!%%'ZZ-
| #&&z40j//11Vz?P?P?R5R n  6S 1 ' $r   returnc                 V    |&t         j                  }t        j                  d|       |S )NzOverriding dtype=%s with `dtype=torch.bfloat16` due to requirements of `fbgemm-gpu` to enable model loading in fp4. Pass your own dtype to specify the dtype of the remaining non-linear layers or pass dtype=torch.bfloat16 to remove this warning.)r*   bfloat16r.   info)r   dtypes     r   update_dtypezMxfp4HfQuantizer.update_dtype   s.    =NNEKK@  r   modelr	   param_valueztorch.Tensor
param_name
state_dictc                    ddl m} ddlm} | j                  j
                  r%d|v sd|v rt        ||d t        d              \  }}	nt        ||      \  }}	t        ||      s"t        ||      r| j                  j
                  r|	dv ryy	y)
Nr   Mxfp4GptOssExpertsGptOssExpertsblocksscales_blocks)down_proj_biasgate_up_proj_biasFT)	integrationsrH   models.gpt_oss.modeling_gpt_ossrJ   r   r)   r   lenr4   )
r   rB   rC   rD   rE   r   rH   rJ   moduletensor_names
             r   check_quantized_paramz&Mxfp4HfQuantizer.check_quantized_param   s     	6C ##..H
4JhZdNd"6ujIZCPYN?>["\FK"6uj"IFKf01v}-$2J2J2U2UEEr   target_deviceztorch.deviceunexpected_keysc                    ddl m}m}	m}
m}m} ddlm} | j                  s| j                         }t        ||      \  }}t        j                  |      5  t        ||      r |||      \  }}|j                  j                  |j                  j                   |j                  j"                  }}} ||||      \  }}d|v rdnd}t%        |||       t%        || d || | |                          t'        || d	       t'        || d
       d d d        y |j)                  d      }|j)                  d      }|j)                  d      }|j)                  d      }|j)                  d      }d|v sd|v r3| j*                  j                  rt        ||d t-        d	              \  }}nt        ||      \  }}||||||d}t        ||      s"t        ||      rf| j*                  j                  rO| j*                  j                  r|d t-        d	        } |	|||||fi | y  |
||||| j                         fi | y y y # 1 sw Y   y xY w)Nr   )rH   r)   load_and_swizzle_mxfp4quantize_to_mxfp4swizzle_mxfp4rI   gate_up_proj	down_proj_precision_config)rhs_data)weight_scaleflex_ctxrM   _scalesempty_paramcasting_dtypeto_contiguousrankdevice_meshrK   rL   )rc   rd   re   rf   rg   rB   )rP   rH   r)   rY   rZ   r[   rQ   rJ   r-   r"   r   r*   devicer4   
matmul_ogsPrecisionConfigFlexCtx
InFlexDatasetattrdelattrr3   r   rR   )r   rB   rC   rD   rV   rE   rW   r   rH   r)   rY   rZ   r[   rJ   r   rS   _triton_weight_tensorr`   rj   rk   rl   projrc   rd   re   rf   rg   shard_kwargsdq_param_names                                 r   create_quantized_paramz'Mxfp4HfQuantizer.create_quantized_param   sz   	
 	
 	D!!!%!:!:!<,UJ?IFAm, 6f&899J;Xj9k6(,*55EE*55==*55@@ /9WO
 :G,l<N:6(, .<z-I>{DFD*>?& 12'\G]g]iLjk FtfG$45FtfG$45+6 64 !**]3K"JJ7M"JJ7M::f%D **]3KJ&(j*@dF^F^FiFi0
CTc)n_8UV	0
C	  +!.!.*L &"456=1d6N6N6Y6Y++66 %//@#i.$AMvz;}m`lm*"#%113 ' 7Z1_6 6s   B?IIc                     | j                   j                  r| j                  |       t        j                  j                         rt        j                  j                          y y r   )r   r)   remove_quantization_configr*   r+   r,   empty_cache)r   rB   r   s      r   #_process_model_after_weight_loadingz4Mxfp4HfQuantizer._process_model_after_weight_loading  sD    ##..++E2::""$JJ""$ %r   expected_keyscheckpoint_keysc                    g }|D ]C  }|j                  d      r8|d t        d        }|j                  |dz          |j                  |dz          M|j                  d      r8|d t        d        }|j                  |dz          |j                  |dz          | j                  s|j                  d	      r$|d t        d        }|j                  |dz          |j                  d
      r%|d t        d        }|j                  |dz          |j                  d      r |j                  |       3|j                  |       F |S )Nz.mlp.experts.gate_up_projr\   gate_up_proj_blocksgate_up_proj_scalesz.mlp.experts.down_projr]   down_proj_blocksdown_proj_scalesz.mlp.experts.down_proj_blocksz .mlp.experts.gate_up_proj_blocksrL   )endswithrR   appendr-   )r   rB   ry   rz   new_expected_keyskeybases          r   update_expected_keysz%Mxfp4HfQuantizer.update_expected_keys  sS     	.C||781c.112!((0E)EF!((0E)EF67.c+../!((0B)BC!((0B)BC''<< ?@9#&8"9!9:D%,,TK-?@\\"DE<#&;"<!<=D%,,TN-BC\\(+%,,S1!((-/	.0 ! r   keep_in_fp32_modulesc                 j   ddl m} | j                  || j                  j                  |      | _        |j                  dd      }|r&t        j                  d       d| j                  _        |j                  } ||| j                  | j                  |      }| j                  |j                  _        y )Nr   )replace_with_mxfp4_linearuse_kernelsFzYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseT)modules_to_not_convertr   config)
rP   r   get_modules_to_not_convertr   r   r3   r.   r/   r)   r   )r   rB   r   r   r   r   r   s          r   $_process_model_before_weight_loadingz5Mxfp4HfQuantizer._process_model_before_weight_loading)  s     	=&*&E&E4++BBDX'
# jj6e 37D$$/)#'#>#> $ 8 8	
 ,0+C+C(r   missing_keysprefixc                 $   ddl m} g }|j                         D ]\  \  }}t        ||      s|D ]E  }||v s
|| d| v s|j	                  d      r#|j	                  d      r5|j                  |       G ^ |D 	cg c]	  }	|	|vs|	 c}	S c c}	w )Nr   rG   .z.weightz.bias)rP   rH   named_modulesr4   r   r   )
r   rB   r   r   rH   not_missing_keysnamerS   missingks
             r   update_missing_keysz$Mxfp4HfQuantizer.update_missing_keysH  s    5!//1 	9LD&&"45+ 9GDvhay4I,I ' 0 0 ; ' 0 0 9(//89	9 (Ea14D+DEEEs   <	BBc                     d|j                   j                  v r-t        |dd        |j                  j	                  ddddd       |S )NGptOssConfigbase_model_tp_plangrouped_gemm)z(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrr   update)r   r   s     r   update_tp_planzMxfp4HfQuantizer.update_tp_planW  sR    V--666v3T:F))00DRDRAOAO	 r   c                 2   | j                   j                  r.d|v r|j                  dd      S d|v r|j                  dd      S |S | j                  sF|j	                  d      r|j                  dd      S |j	                  d      r|j                  dd      S |S )NrM    rb   r\   r|   r]   r~   )r   r)   replacer-   r   )r   rD   s     r   update_param_namez"Mxfp4HfQuantizer.update_param_named  s    ##..J&!)))R88j(!)))R88  ##"">2!)).:OPP"";/!))+7IJJr   c                 d   ddl m} |j                         }|j                         D ]  \  }}t	        ||      st        |d      s!t        |d      s.|j                  j                  j                  j                  |j                  j                  j                        j                  dd      j                  dddd	      || d
<   |j                  j                  j                  j                  j                  |j                  j                  j                  j                        j                  dd      || d<   |j                  j                  j                  j                  |j                  j                  j                        j                  dd      j                  dddd      || d<   |j                   j                  j                  j                  j                  |j                   j                  j                  j                        j                  dd      || d<    |S )Nr   rG   r\   r]       Z      z.gate_up_proj_blocksz.gate_up_proj_scalesi@  z.down_proj_blocksz.down_proj_scales)rP   rH   rE   r   r4   hasattrr\   storagelayoutunswizzle_datadata	transposereshapegate_up_proj_precision_configr`   r]   down_proj_precision_config)r   rB   rH   rE   r   rS   s         r   get_state_dictzMxfp4HfQuantizer.get_state_dictq  s   5%%'
!//1 	LD&6#56FN3FK0 ''//66EEfFYFYFaFaFfFfgYr2&WRR, dV#789 88EEMMTTcc<<IIQQVViB' dV#789 $$,,33BB6CSCSC[C[C`C`aYr2&WRr2. dV#456 55BBJJQQ``99FFNNSSiB' dV#456+	6 r   c                      y)NT )r   safe_serializations     r   is_serializablez Mxfp4HfQuantizer.is_serializable  s    r   c                 .    t         j                  d       y)NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()F)r.   r/   )r   s    r   is_trainablezMxfp4HfQuantizer.is_trainable  s     x	
 r   )r@   torch.dtyper<   r   r   )rB   r	   )r   
__module____qualname____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r"   r;   rA   strr5   r   rU   r   listrt   rx   r   r   r   r   r   r   r   propertyboolr   __classcell__)r   s   @r   r   r   '   s    (,$ %'
	'GR
  $ 	
 cN> 04T T $T 	T
 &T cNT "$s),Tl%!*; !DQTI !hlmphq !@ 59D D 'tCy1D>FtCy F# FRVWZR[ FC C  D d  r   r   )typingr   r   r   r   r   modeling_utilsr	   utilsr
   r   r   r   r   quantizers_utilsr   r*   
get_loggerr   r.   r   r   r   r   r   <module>r      sX    0 /  0  3 			H	% t{ tr   