from typing import Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from .configuration_cohere2 import Cohere2Config
 xZS )Cohere2RotaryEmbeddinginv_freqconfigc                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultr   F)
persistent)super__init__hasattr
isinstancer"   dictgetr#   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr    r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)selfr    devicer   	__class__s       j/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/cohere2/modeling_cohere2.pyr(   zCohere2RotaryEmbedding.__init__.   s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%    c                 .   | j                   d d d d f   j                         j                  |j                  d   dd      }|d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  |dd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j                  |j                   
      	j                  |j                   
      fS # 1 sw Y   AxY w)Nr   r   mpscpuF)device_typeenabled   dimdtype)r   floatexpandshaper*   r5   r$   strtorchautocast	transposerepeat_interleavecosr1   sintorC   )
r4   xposition_idsinv_freq_expandedposition_ids_expandedr=   freqsembrL   rM   s
             r7   forwardzCohere2RotaryEmbedding.forward?   sD    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))%;C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFFN)__name__
__module____qualname__rH   Tensor__annotations__r   r(   no_gradr   rU   __classcell__r6   s   @r7   r   r   +   s=    ll/} /" U]]_<  <r8   r   c                   &     e Zd Zd fd	Zd Z xZS )Cohere2LayerNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zcThe hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dimN)r'   r(   nn	ParameterrH   onesweightvariance_epsilon)r4   hidden_sizeepsbiasr6   s       r7   r(   zCohere2LayerNorm.__init__P   s/    ll5::k#:; #r8   c                    |j                   }|j                  t        j                        }|j	                  dd      }||z
  j                  d      j	                  dd      }||z
  t        j                  || j                  z         z  }| j                  j                  t        j                        |z  }|j                  |      S )Nr:   T)keepdimr?   )	rC   rN   rH   float32meanpowrsqrtrf   re   )r4   hidden_statesinput_dtyperm   variances        r7   rU   zCohere2LayerNorm.forwardV   s    #))%((7!!"d!3!D(--a055b$5G&-XH]H]=]1^^u}}5E,,r8   )Ngh㈵>FrW   rX   rY   r(   rU   r]   r^   s   @r7   r`   r`   O   s    $-r8   r`   rp   n_repreturnc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
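
# Illustrative shape check for repeat_kv (a sketch, not part of the model): with
# 2 key/value heads expanded to match 4 query heads (n_rep=2), each KV head is
# duplicated along the head dimension:
#
#     kv = torch.randn(1, 2, 16, 64)   # (batch, num_kv_heads, seq, head_dim)
#     repeat_kv(kv, 2).shape           # torch.Size([1, 4, 16, 64])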
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr?   r   r:   )rA   rC   )ptrainingr   )r|   num_key_value_groupsrH   matmulrJ   rF   rb   
functionalsoftmaxrl   rN   rC   r   r   
contiguous)r}   r~   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r7   eager_attention_forwardr   l   s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r8   c                     | dd d df   }| ddd df   }t        j                  | |gd      j                  d      }|S )N.r?   r   r:   r@   r   )rH   stackflatten)rO   x1x2rot_xs       r7   rotate_halfr      sL    	


def rotate_half(x):
    # Split and rotate. Note that this function is different from e.g. Llama.
    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
    return rot_x


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    dtype = q.dtype
    q = q.float()
    k = k.float()
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
ee	j                  e	j                  f   dee	j                     dee   dee	j                     dee   dee	j                  ee	j                     eee	j                        f   fd       Z xZS )Cohere2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperr    	layer_idxc                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        |j                  |   dk(  r|j                  nd | _        t        j                   |j
                  |j                  | j                  z  |j"                        | _        t        j                   |j
                  |j                  | j                  z  |j"                        | _        t        j                   |j
                  |j                  | j                  z  |j"                        | _        t        j                   |j                  | j                  z  |j
                  |j"                        | _        y )Nr{   g      Tsliding_attentionri   )r'   r(   r    r   getattrrg   num_attention_headsr{   ry   r   r   attention_dropout	is_causallayer_typessliding_windowrb   Linearattention_biasq_projk_projv_projo_projr4   r    r   r6   s      r7   r(   zCohere2Attention.__init__   sq   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!97=7I7I)7TXk7kf33quii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r8   past_key_valuepast_key_values4.58new_nameversionrp   position_embeddingsr   cache_positionr   ru   c                 b   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}| j                  t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                   | j"                  | j                  d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )Nr:   r   r?   )rM   rL   r   eager        )r   r   r   )rF   r{   r   viewrJ   r   r   r   r   updater   r   r    _attn_implementationr   r   r   r   rw   r   r   )r4   rp   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rL   rM   cache_kwargsattention_interfacer   r   s                     r7   rU   zCohere2Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S*';L*VY[^'_$L*&#&snUL'6'='=j,X\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL..
%
 
%
!\ *k));;;;FFHkk+.L((r8   rV   )NN)rW   rX   rY   __doc__r   r   intr(   r   rH   rZ   tupler   
LongTensorr   r   rU   r]   r^   s   @r7   r   r      s    G
} 
# 
0 %0A6R ,059*)||*) #5<<#=>*) !.	*)
 "%*) !!1!12*) -.*) 
u||Xell3XeELL>Q5RR	S*) S*)r8   r   c                   $     e Zd Z fdZd Z xZS )
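
# Design note (hedged sketch, not part of the model): only layers whose
# `config.layer_types` entry is "sliding_attention" get a sliding window and
# rotary embeddings; full-attention layers attend globally without RoPE. With
# illustrative (not checkpoint) config values:
#
#     config = Cohere2Config(sliding_window=4096, sliding_window_pattern=4)
#     Cohere2Attention(config, layer_idx=0).sliding_window   # 4096 -> sliding layer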


class Cohere2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class Cohere2DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Cohere2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = Cohere2Attention(config=config, layer_idx=layer_idx)
        self.mlp = Cohere2MLP(config)
        self.input_layernorm = Cohere2LayerNorm(hidden_size=config.hidden_size, eps=config.layer_norm_eps)
        self.attention_type = config.layer_types[layer_idx]

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states_attention, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        # Fully Connected
        hidden_states_mlp = self.mlp(hidden_states)

        # Add everything together
        hidden_states = residual + hidden_states_attention + hidden_states_mlp

        return hidden_states
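
# Note (sketch): unlike a sequential pre-norm block, the layer above is
# "parallel" — attention and MLP both read the same normalized input and their
# outputs are summed with the residual, i.e.
#
#     h = x + attn(norm(x)) + mlp(norm(x))
#
# rather than the more common h = x + attn(norm(x)); h = h + mlp(norm(h)).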
dZdZdZeedZy)Cohere2PreTrainedModelr    modelTr   r   )rp   
attentionsN)rW   rX   rY   r   r[   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr   r8   r7   r   r   =  sQ    &*#./#4"5N!"&,&r8   r   c                       e Zd Zdef fdZee	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee   deej                     dee   d	eej                     d


@auto_docstring
class Cohere2Model(Cohere2PreTrainedModel):
    def __init__(self, config: Cohere2Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Cohere2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Cohere2LayerNorm(hidden_size=config.hidden_size, eps=config.layer_norm_eps)
        self.rotary_emb = Cohere2RotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        if not isinstance(causal_mask_mapping := attention_mask, dict):
            # Prepare mask arguments
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
            }
            # Create the masks
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
            }

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers:
            hidden_states = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )
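
# Minimal forward-pass sketch for the bare model (illustrative only; the sizes
# below are hypothetical, not a released checkpoint's):
#
#     config = Cohere2Config(num_hidden_layers=2, hidden_size=64, num_attention_heads=4,
#                            num_key_value_heads=2, intermediate_size=128)
#     model = Cohere2Model(config)
#     hidden = model(torch.randint(0, config.vocab_size, (1, 12))).last_hidden_state
#     hidden.shape   # torch.Size([1, 12, 64])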


@auto_docstring
class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config: Cohere2Config):
        super().__init__(config)
        self.model = Cohere2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.logit_scale = config.logit_scale
        self.tie_word_embeddings = config.tie_word_embeddings

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Cohere2ForCausalLM

        >>> model = Cohere2ForCausalLM.from_pretrained("Cohere2ForAI/c4ai-command-r-v01")
        >>> tokenizer = AutoTokenizer.from_pretrained("Cohere2ForAI/c4ai-command-r-v01")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        logits = logits * self.logit_scale

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

__all__ = ["Cohere2ForCausalLM", "Cohere2Model", "Cohere2PreTrainedModel"]