o
    \iA                     @   s>   d Z ddlZddlmZ ddlmZ dd ZG dd	 d	ZdS )
zA
Loss functions for linear models with raw_prediction = X @ coef
    N)sparse   )squared_normc                 C   sV   | j d }t| r| jtj|df||fd |   S |dddf |  }| j| S )z/Compute the sandwich product X.T @ diag(W) @ X.r   shapeN)r   r   issparseT
dia_matrixtoarray)XW	n_samplesWX r   /var/www/www-root/data/www/176.119.141.140/sports-predictor/venv/lib/python3.10/site-packages/sklearn/linear_model/_linear_loss.pysandwich_dot   s   


r   c                   @   s   e Zd ZdZdd ZdddZdd Zd	d
 Zdd Z				dddZ					dddZ
				dddZ						dddZ	dddZdS )LinearModelLossa
	  General class for loss functions with raw_prediction = X @ coef + intercept.

    Note that raw_prediction is also known as linear predictor.

    The loss is the average of per sample losses and includes a term for L2
    regularization::

        loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept)
               + 1/2 * l2_reg_strength * ||coef||_2^2

    with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i.

    Gradient and hessian, for simplicity without intercept, are::

        gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef
        hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X
                  + l2_reg_strength * identity

    Conventions:
        if fit_intercept:
            n_dof =  n_features + 1
        else:
            n_dof = n_features

        if base_loss.is_multiclass:
            coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
        else:
            coef.shape = (n_dof,)

        The intercept term is at the end of the coef array:
        if base_loss.is_multiclass:
            if coef.shape (n_classes, n_dof):
                intercept = coef[:, -1]
            if coef.shape (n_classes * n_dof,)
                intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
            intercept.shape = (n_classes,)
        else:
            intercept = coef[-1]

        Shape of gradient follows shape of coef.
        gradient.shape = coef.shape

        But hessian (to make our lives simpler) are always 2-d:
        if base_loss.is_multiclass:
            hessian.shape = (n_classes * n_dof, n_classes * n_dof)
        else:
            hessian.shape = (n_dof, n_dof)

    Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as

        coef.reshape((n_classes, -1), order="F")

    The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
    coefficients without intercept, coef[:, :-1], contiguous and speeds up
    matrix-vector computations.

    Note: If the average loss per sample is wanted instead of the sum of the loss per
    sample, one can simply use a rescaled sample_weight such that
    sum(sample_weight) = 1.

    Parameters
    ----------
    base_loss : instance of class BaseLoss from sklearn._loss.
    fit_intercept : bool
    c                 C   s   || _ || _d S N)	base_lossfit_intercept)selfr   r   r   r   r   __init__h   s   
zLinearModelLoss.__init__Nc                 C   s\   |j d }| jj}| jr|d }n|}| jjr$tj|||f|dd}|S tj|||d}|S )a  Allocate coef of correct shape with zeros.

        Parameters:
        -----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        dtype : data-type, default=None
            Overrides the data type of coef. With dtype=None, coef will have the same
            dtype as X.

        Returns
        -------
        coef : ndarray of shape (n_dof,) or (n_classes, n_dof)
            Coefficients of a linear model.
           F)r   dtypeorder)r   r   )r   r   	n_classesr   is_multiclassnp
zeros_like)r   r   r   
n_featuresr   n_dofcoefr   r   r   init_zero_coefl   s   

zLinearModelLoss.init_zero_coefc                 C   s   | j js| jr|d }|dd }||fS d}|}||fS |jdkr.|j| j jdfdd}n|}| jrI|dddf }|ddddf }||fS d}||fS )a  Helper function to get coefficients and intercept.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        N        r   r   r   )r   r   r   ndimreshaper   )r   r"   	interceptweightsr   r   r   weight_intercept   s"   
z LinearModelLoss.weight_interceptc                 C   s<   |  |\}}| jjs|| | }n||j | }|||fS )ai  Helper function to get coefficients, intercept and raw_prediction.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        raw_prediction : ndarray of shape (n_samples,) or             (n_samples, n_classes)
        )r+   r   r   r   )r   r"   r   r*   r)   raw_predictionr   r   r   weight_intercept_raw   s
   
z$LinearModelLoss.weight_intercept_rawc                 C   s&   |j dkr	|| nt|}d| | S )z5Compute L2 penalty term l2_reg_strength/2 *||w||_2^2.r   g      ?)r'   r   )r   r*   l2_reg_strengthnorm2_wr   r   r   
l2_penalty   s   zLinearModelLoss.l2_penaltyr%   r   c                 C   s\   |du r|  ||\}}	}n| |\}}	| jj||d|d}
tj|
|d}
|
| || S )a  Compute the loss as weighted average over point-wise losses.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Weighted average of losses per sample, plus penalty.
        Ny_truer,   sample_weight	n_threadsr*   )r-   r+   r   lossr   averager0   )r   r"   r   yr3   r.   r4   r,   r*   r)   r6   r   r   r   r6      s   'zLinearModelLoss.lossc                 C   s^  |j | jj\}}	}
|	t| j }|du r| ||\}}}n| |\}}| jj||||d\}}|du r8|nt	|}|	 | }|| 
||7 }|| }| jjsutj||jd}|j| ||  |d|	< | jrq|	 |d< ||fS tj|
|f|jdd}|j| ||  |ddd|	f< | jr|j	dd|dddf< |jd	kr|jdd
}||fS )a\  Computes the sum of loss and gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Weighted average of losses per sample, plus penalty.

        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        Nr1   r   r$   r   r   r   r   axisr   r&   )r   r   r   intr   r-   r+   loss_gradientr   sumr0   r   
empty_liker   r   emptyr'   ravel)r   r"   r   r8   r3   r.   r4   r,   r   r    r   r!   r*   r)   r6   grad_pointwisesw_sumgradr   r   r   r>   
  s8   *

"
zLinearModelLoss.loss_gradientc                 C   s6  |j | jj\}}	}
|	t| j }|du r| ||\}}}n| |\}}| jj||||d}|du r6|nt	|}|| }| jj
sctj||jd}|j| ||  |d|	< | jra|	 |d< |S tj|
|f|jdd}|j| ||  |ddd|	f< | jr|j	dd|dddf< |jd	kr|jdd
S |S )a  Computes the gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        Nr1   r9   r$   r   r:   r   r;   r   r&   )r   r   r   r=   r   r-   r+   gradientr   r?   r   r@   r   r   rA   r'   rB   )r   r"   r   r8   r3   r.   r4   r,   r   r    r   r!   r*   r)   rC   rD   rE   r   r   r   rF   X  s4   '"
zLinearModelLoss.gradientc
                 C   s*  |j | jj\}
}}|t| j }|	du r| ||\}}}	n| |\}}|du r,|
nt|}|du r?tj	||j
dd}n!|j |j krRtd|j  d|j  d| jjr^|jjs^td|}|j}|du rrtj||f|j
d}n'|j ||fkrtd	||f d
|j d| jjr|jjs|jjstd|}| jjs.| jj||	||d\}}|| }|| }tj|dk|ddk}t|}|j| ||  |d|< | jr| |d< |r|||fS t|||d|d|f< |dkr|jjrdnd}|jd|dd|| |d   |7  < | jr,|j| }||dddf< ||dddf< | |d< nb| jj||	||d\}}|| }|j||fdd}|j| ||  |ddd|f< | jrh|jdd|dddf< |jdkrt|jdd}|dur~|| }nd| }t|D ]}|dd|f d|dd|f   | }t|||||| |||| |f< | jr|j| }||||| ||| | f< |||| | ||| |f< | ||| | || | f< t|d |D ]v}|dd|f  |dd|f  | }t|||||| |||| |f< | jrL|j| }||||| ||| | f< |||| | ||| |f< | ||| | || | f< ||d||d|f ||d||d|f< qq|dkr|jjrrdnd}|jd|dd|d | | || d   |7  < d}|||fS )a~  Computes gradient and hessian w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        gradient_out : None or ndarray of shape coef.shape
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or ndarray of shape (n_dof, n_dof) or             (n_classes * n_dof, n_classes * n_dof)
            A location into which the hessian is stored. If None, a new array
            might be created.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessian : ndarray of shape (n_dof, n_dof) or             (n_classes, n_dof, n_dof, n_classes)
            Hessian matrix.

        hessian_warning : bool
            True if pointwise hessian has more than 25% of its elements non-positive.
        Nr   r:   z4gradient_out is required to have shape coef.shape = z; got .z"gradient_out must be F-contiguous.r9   z'hessian_out is required to have shape (z); got hessian_out.shape=zhessian_out must be contiguous.r1   r   r5   g      ?r$   Cr&   r   )r$   r$   r;   g      ?r   F)r   r   r   r=   r   r-   r+   r   r?   r@   r   
ValueErrorr   flagsf_contiguoussizerA   c_contiguousgradient_hessianr7   absr   r   r(   gradient_probar'   rB   range)r   r"   r   r8   r3   r.   r4   gradient_outhessian_outr,   r   r    r   r!   r*   r)   rD   rE   nhessrC   hess_pointwisehessian_warningr   Xhprobaswkhlr   r   r   rN     s  7





 

"
&
(


&


0

z LinearModelLoss.gradient_hessianc                    s   j jj\}tj  \}}	
du r |nt
jjsjj	||	
|d\}
}|
 }
| }tj
jd} j|
   |d< jr[|
 |d< | t rrtj|df||fd  n|ddtjf   jrttjddt fdd	}||fS jj||	
|d\}
	|
 }
tjfjd
d}|
j    |dddf< jr|
jdd|dddf<  	
fdd	}jdkr|jd
d|fS ||fS )a  Computes gradient and hessp (hessian product function) w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessp : callable
            Function that takes in a vector input of shape of gradient and
            and returns matrix-vector product with hessian.
        Nr1   r9   r$   r   r   r;   c                    s   t | }t r j| d    |d < nt j j| d  g|d < |d   | d   7  < jr]|d   | d  7  < | d   | d   |d< |S )Nr$   )r   r@   r   r   r   linalg	multi_dotr   )sret)r   hXhX_sumhessian_sumr.   r    r   r   r   hessp  s   

 $  z7LinearModelLoss.gradient_hessian_product.<locals>.hesspr   r:   c                    s  | j dfdd} jr| d d df }| d d d df } nd} | j | }| | jddd d tjf 7 }|9 }d urM|d d tjf 9 }tjf
jdd}|j  	 |   |d d d f< jr||jdd	 |d d df< jdkr|j	ddS |S )Nr$   r   r&   r   r   r;   r:   )
r(   r   r   r?   r   newaxisrA   r   r'   rB   )r`   s_intercepttmp	hess_prod)r   r"   r.   r   r!   r    rY   r3   r   rD   r*   r   r   re     s"   $&
r   r&   )r   r   r   r=   r   r-   r   r?   r   rN   r@   r   r   r   r   r	   rf   squeezeasarray
atleast_1drP   rA   r'   rB   )r   r"   r   r8   r3   r.   r4   r   r)   r,   rC   rV   rE   re   r   )r   r"   rb   rc   rd   r.   r   r!   r    rY   r3   r   rD   r*   r   gradient_hessian_product  sX    


N
" 
z(LinearModelLoss.gradient_hessian_productr   )Nr%   r   N)Nr%   r   NNN)Nr%   r   )__name__
__module____qualname____doc__r   r#   r+   r-   r0   r6   r>   rF   rN   rm   r   r   r   r   r   %   sB    B
' 

;
S
N
  r   )	rq   numpyr   scipyr   utils.extmathr   r   r   r   r   r   r   <module>   s    