"""PyTorch Qwen3 model."""

from typing import Callable, Optional

import torch

from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ...utils.deprecation import deprecate_kwarg
from ..gemma.modeling_gemma import GemmaMLP
from ..llama.modeling_llama import LlamaAttention
from ..qwen2.modeling_qwen2 import (
    Qwen2DecoderLayer,
    Qwen2ForCausalLM,
    Qwen2ForQuestionAnswering,
    Qwen2ForSequenceClassification,
    Qwen2ForTokenClassification,
    Qwen2Model,
    Qwen2PreTrainedModel,
    Qwen2RMSNorm,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from .configuration_qwen3 import Qwen3Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen3-8B"


class Qwen3RMSNorm(Qwen2RMSNorm):
    pass


class Qwen3MLP(GemmaMLP):
    pass


class Qwen3Attention(LlamaAttention):
    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__(config, layer_idx)
        # QK-norm: unlike plain Qwen2 attention, the query/key projections are
        # RMS-normalized per head (the norm is sized to head_dim, not hidden_size).
        self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,  # main difference from the Llama attention path
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
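

# A shape walk-through of Qwen3Attention.forward above (an added sketch, not part
# of the upstream file; the concrete sizes are illustrative assumptions, roughly
# Qwen3-8B-like: num_heads=32, head_dim=128, hidden_size=4096, batch=2, seq=5):
#
#   hidden_states                        -> (2, 5, 4096)
#   q_proj(hidden_states)                -> (2, 5, 32 * 128)
#   .view(*input_shape, -1, head_dim)    -> (2, 5, 32, 128)
#   q_norm(...)                          -> (2, 5, 32, 128)   RMSNorm over the last
#                                                             (head_dim) axis only
#   .transpose(1, 2)                     -> (2, 32, 5, 128)   heads-first for attention
#
# Because q_norm/k_norm are built with head_dim features, they broadcast over the
# (batch, seq, heads) axes of the already-split projection, so the per-head QK-norm
# needs no extra reshape. Similarly, sliding_window is resolved once per layer in
# __init__: layers whose config.layer_types entry is "sliding_attention" pass
# config.sliding_window to the attention backend, while all other layers pass None
# and attend over the full context.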


class Qwen3DecoderLayer(Qwen2DecoderLayer):
    pass


class Qwen3PreTrainedModel(Qwen2PreTrainedModel):
    pass


class Qwen3Model(Qwen2Model):
    pass


class Qwen3ForCausalLM(Qwen2ForCausalLM):
    def forward(
        self,
        **super_kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        return super().forward(**super_kwargs)


class Qwen3ForSequenceClassification(Qwen2ForSequenceClassification):
    pass


class Qwen3ForTokenClassification(Qwen2ForTokenClassification):
    pass


class Qwen3ForQuestionAnswering(Qwen2ForQuestionAnswering):
    pass


__all__ = [
    "Qwen3ForCausalLM",
    "Qwen3ForQuestionAnswering",
    "Qwen3PreTrainedModel",
    "Qwen3Model",
    "Qwen3ForSequenceClassification",
    "Qwen3ForTokenClassification",
]
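

# Added note (not in the upstream file): in transformers' "modular" layout, this
# modular_qwen3.py is the hand-written source from which the flat modeling_qwen3.py
# is auto-generated. Stub subclasses such as `class Qwen3Model(Qwen2Model): pass`
# are expanded by the converter into full copies of the parent implementation with
# the "Qwen2" prefix renamed to "Qwen3". Assuming a transformers source checkout,
# regeneration looks roughly like (exact flags may differ by version):
#
#   python utils/modular_model_converter.py \
#       --files_to_parse src/transformers/models/qwen3/modular_qwen3.py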