
"""PyTorch CTRL model."""

from typing import Optional, Union

import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...cache_utils import DynamicCache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging
from .configuration_ctrl import CTRLConfig


logger = logging.get_logger(__name__)


def angle_defn(pos, i, d_model_size):
    angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
    return pos * angle_rates


def positional_encoding(position, d_model_size, dtype):
    # create the sinusoidal pattern for the positional encoding
    angle_rads = angle_defn(
        torch.arange(position, dtype=torch.int64).to(dtype).unsqueeze(1),
        torch.arange(d_model_size, dtype=torch.int64).to(dtype).unsqueeze(0),
        d_model_size,
    )

    sines = torch.sin(angle_rads[:, 0::2])
    cosines = torch.cos(angle_rads[:, 1::2])

    pos_encoding = torch.cat([sines, cosines], dim=-1)
    return pos_encoding


def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
    # calculate attention scores
    matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2))

    dk = k.shape[-1]
    scaled_attention_logits = matmul_qk / np.sqrt(dk)

    if mask is not None:
        nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
        scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4

    if attention_mask is not None:
        # Apply the attention mask
        scaled_attention_logits = scaled_attention_logits + attention_mask

    attention_weights = torch.softmax(scaled_attention_logits, dim=-1)

    # Mask heads if we want to
    if head_mask is not None:
        attention_weights = attention_weights * head_mask

    output = torch.matmul(attention_weights, v)

    return output, attention_weights
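
# Illustrative note (not part of the library API): `positional_encoding(position, d_model_size, dtype)`
# returns a `(position, d_model_size)` tensor whose first half of the feature dimension holds the sines
# and the second half the cosines of the usual sinusoidal schedule. A minimal sketch of how the helpers
# above fit together, assuming small toy shapes:
#
#     pe = positional_encoding(8, 16, torch.float)           # -> torch.Size([8, 16])
#     q = k = v = torch.randn(1, 2, 8, 4)                    # (batch, heads, seq, head_dim)
#     causal = torch.triu(torch.ones(8, 8), 1)               # 1s above the diagonal get masked out
#     out, weights = scaled_dot_product_attention(q, k, v, causal)
#     # out: (1, 2, 8, 4); weights: (1, 2, 8, 8), each row sums to 1 over the allowed positions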
B'"''"+5(--b13J3O3OPR3SB4R"crc(9#:T#AA!"9N"J&=2F -	9\\+Q/F$$$r!   c                   @     e Zd Zd fd	Zd Zd Z	 	 	 	 	 	 ddZ xZS )MultiHeadAttentionc                 |   t         |           || _        || _        || _        t        || j                  z        | _        t        j                  ||      | _	        t        j                  ||      | _
        t        j                  ||      | _        t        j                  ||      | _        t               | _        y N)super__init__	num_headsr   	layer_idxintdepthr   LinearWqWkWvdensesetpruned_heads)selfr   rR   rS   	__class__s       r   rQ   zMultiHeadAttention.__init__W   s    "("67
))L,7))L,7))L,7YY|\:
Er!   c                    | j                   | j                  z  }t        |      dk(  ry t        || j                  || j                        \  }}t        | j                  |      | _        t        | j                  |      | _        t        | j                  |      | _        t        | j                  |d      | _	        | j                  t        |      z
  | _        || j                  z  | _         | j                  j                  |      | _        y )Nr   r   r&   )r   rR   lenr   r\   r   rW   rX   rY   rZ   union)r]   headsattention_head_sizeindexs       r   prune_headszMultiHeadAttention.prune_headsf   s    "//4>>Au:?7t~~Obdhduduvu %TWWe4$TWWe4$TWWe4'

EqA
 #e*4/$..@ --33E:r!   c                 x    |j                  |d| j                  | j                        }|j                  g d      S )Nr%   r   r   r   r	   )reshaperR   rU   r8   )r]   x
batch_sizes      r   split_into_headsz#MultiHeadAttention.split_into_headsw   s-    IIj"dnndjjAyy&&r!   c                    |j                   d   }| j                  |      }| j                  |      }| j                  |      }| j	                  ||      }| j	                  ||      }| j	                  ||      }|#|j                  ||| j                  d|
i      \  }}t        ||||||      }|d   j                  g d      }|d   }|j                  |d| j                        }| j                  |      }||fS )Nr   cache_positionrg   r   r%   )r9   rW   rX   rY   rk   updaterS   rK   r8   rh   r   rZ   )r]   r@   r?   r>   rA   
layer_pastrB   rC   	use_cacheoutput_attentionsrm   rj   rJ   scaled_attentionattnoriginal_size_attentions                   r   forwardzMultiHeadAttention.forward{   s     WWQZ
GGAJGGAJGGAJ!!!Z0!!!Z0!!!Z0!$$Q4>><Ln;]^DAq-aAt^YW!!9,,\:ay"2":"::r4K\K\"]34t|r!   rO   NNNFFN)__name__
__module____qualname__rQ   re   rk   ru   __classcell__r^   s   @r   rM   rM   V   s+    ";"' r!   rM   c                     t        j                  t        j                  | |      t        j                         t        j                  ||             S rO   )r   
SequentialrV   ReLU)r   dffs     r   point_wise_feed_forward_networkr      s2    ==<5rwwy"))CQ]B^__r!   c                   4     e Zd Zd fd	Z	 	 	 	 	 	 ddZ xZS )EncoderLayerc                 B   t         |           t        |||      | _        t	        ||      | _        t        j                  |d      | _        t        j                  |d      | _	        t        j                  |      | _        t        j                  |      | _        y )NrS   gư>eps)rP   rQ   rM   multi_head_attentionr   ffnr   	LayerNorm
layernorm1
layernorm2Dropoutdropout1dropout2)r]   r   rR   r   raterS   r^   s         r   rQ   zEncoderLayer.__init__   sr    $6|YZc$d!2<E,,|>,,|>

4(

4(r!   c	                    | j                  |      }	| j                  |	|	|	|||||||
      }
|
d   }| j                  |      }||z   }| j                  |      }| j	                  |      }| j                  |      }||z   }|f|
dd  z   }|S )Nro   rB   rC   rp   rq   rm   r   r   )r   r   r   r   r   r   )r]   ri   rA   ro   rB   rC   rp   rq   rm   normedattn_outputsattn_outputout1out2
ffn_outputoutputss                   r   ru   zEncoderLayer.forward   s     #00!)/) 1 
 #1ommK0;t$XXd^
]]:.
j 'L,,r!   )g?Nrv   )rw   rx   ry   rQ   ru   rz   r{   s   @r   r   r      s!    
)  "r!   r   c                   "    e Zd ZU eed<   dZd Zy)CTRLPreTrainedModelconfigtransformerc                    t        |t        j                  t        f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rz|j                  j
                  j                  d| j                  j                         |j                  2|j                  j
                  |j                     j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yy)zInitialize the weights.g        )meanstdN      ?)
isinstancer   rV   r   weightdatanormal_r   initializer_rangebiaszero_	Embeddingpadding_idxr   fill_)r]   modules     r   _init_weightsz!CTRLPreTrainedModel._init_weights   s   fryy&12 MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r!   N)rw   rx   ry   r   __annotations__base_model_prefixr    r!   r   r   r      s    %*r!   r   c                       e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     deeee	j                           dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee   dee   dee   dee   dee	j                     deee	j                     ef   fd       Z xZS )	CTRLModelc                    t         |   |       |j                  | _        |j                  | _        t        |j                  | j                  t        j                        | _
        t        j                  |j                  |j                        | _        t        j                  |j                         | _        t        j$                  t'        |j                        D cg c]:  }t)        |j                  |j*                  |j,                  |j.                  |      < c}      | _        t        j2                  |j                  |j4                        | _        | j9                          y c c}w )Nr   r   )rP   rQ   n_embdr   n_layer
num_layersr4   n_positionsr   floatr3   r   r   
vocab_sizewr   
embd_pdropdropout
ModuleListranger   n_headr   resid_pdrophr   layer_norm_epsilon	layernorm	post_init)r]   r   r   r^   s      r   rQ   zCTRLModel.__init__   s     "MM ../0B0BDDUDUW\WbWbcf//?zz&"3"34 v~~. V]]FMM6::vGYGYefg
 fmm9R9RS 	s    ?E,c                     | j                   S rO   r   )r]   s    r   get_input_embeddingszCTRLModel.get_input_embeddings  s    vvr!   c                     || _         y rO   r   )r]   new_embeddingss     r   set_input_embeddingszCTRLModel.set_input_embeddings  s	    r!   c                     |j                         D ]-  \  }}| j                  |   j                  j                  |       / y)zv

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.h[layer].multi_head_attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CTRLModel
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 5, 1280]
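        >>> # Illustrative note: `outputs.past_key_values` holds the attention cache; it can be passed
        >>> # back on a later call (together with only the newly added tokens) to continue decoding
        >>> # incrementally instead of re-encoding the whole sequence.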
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)
        if use_cache and isinstance(past_key_values, tuple):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `DynamicCache` instead, e.g. "
                "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)

        past_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        if position_ids is None:
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)

        # Attention mask: turn the 2D padding mask into a broadcastable additive mask with large negative
        # values at masked positions, so it can simply be added to the raw attention scores.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
            token_type_embeds = self.w(token_type_ids)
            token_type_embeds *= np.sqrt(self.d_model_size)
        else:
            token_type_embeds = 0

        if inputs_embeds is None:
            inputs_embeds = self.w(input_ids)
        seq_len = input_shape[-1]
        mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(device)

        inputs_embeds *= np.sqrt(self.d_model_size)

        # `self.pos_encoding` won't be moved to the correct device along with the model, so do it manually.
        self.pos_encoding = self.pos_encoding.to(device)
        pos_embeds = self.pos_encoding[position_ids, :]

        hidden_states = inputs_embeds + pos_embeds + token_type_embeds

        hidden_states = self.dropout(hidden_states)

        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for i, h in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            outputs = h(
                hidden_states,
                mask,
                layer_past=past_key_values,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            hidden_states = outputs[0]
            if output_attentions:
                all_attentions += (outputs[1],)

        hidden_states = self.layernorm(hidden_states)
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v for v in [hidden_states, past_key_values, all_hidden_states, all_attentions] if v is not None
            )

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )


@auto_docstring(
    custom_intro="""
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class CTRLLMHeadModel(CTRLPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = CTRLModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLLMHeadModel

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> sequence_ids = model.generate(inputs["input_ids"])
        >>> sequences = tokenizer.batch_decode(sequence_ids)
        >>> sequences
        ['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']

        >>> outputs = model(**inputs, labels=inputs["input_ids"])
        >>> round(outputs.loss.item(), 2)
        9.21

        >>> list(outputs.logits.shape)
        [1, 5, 246534]
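        >>> # Illustrative note: a loss of 9.21 nats corresponds to a perplexity of exp(9.21) ≈ 1.0e4;
        >>> # for comparison, chance level over the 246,534-token vocabulary would be ln(246534) ≈ 12.4 nats.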
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            loss = self.loss_function(
                lm_logits,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cache=None, **kwargs):
        # Omit tokens covered by past_key_values
        if past_key_values is not None:
            past_length = past_key_values.get_seq_length()

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache}


@auto_docstring(
    custom_intro="""
    The CTRL Model transformer with a sequence classification head on top (linear layer).
    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
    value in each row of the batch).
    """
)
class CTRLForSequenceClassification(CTRLPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = CTRLModel(config)
        self.classifier = nn.Linear(config.n_embd, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            `config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> import torch

        >>> torch.manual_seed(42)  # doctest: +IGNORE_RESULT
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> labels = torch.tensor(1)
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)
        0.93
        ```

        Example of multi-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained(
        ...     "Salesforce/ctrl", problem_type="multi_label_classification"
        ... )

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> num_labels = len(model.config.id2label)
        >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
        ...     torch.float
        ... )
        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()  # doctest: +IGNORE_RESULT
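        >>> # Illustrative note: with `problem_type="multi_label_classification"` the loss above is
        >>> # `BCEWithLogitsLoss` applied to the one-hot `labels`; the single-label examples earlier use
        >>> # `CrossEntropyLoss` instead (see the `problem_type` handling in `forward`).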
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        logits = self.classifier(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not a padding token
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=pooled_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = ["CTRLForSequenceClassification", "CTRLLMHeadModel", "CTRLModel", "CTRLPreTrainedModel"]