Source code for iteraa.utils

import numpy as np
from scipy.optimize import nnls
from sklearn.metrics import explained_variance_score




[docs]
def findFurthestPoint(xSearch, xRef):
    """
    This function finds a data point in x_search which has the furthest 
    distance from all data points in xRef. In the case of archetypes,
    xRef is the archetypes and xSearch is the dataset.
    
    Note:
        In both xSearch and xRef, the columns of the arrays should be the
        dimensions and the rows should be the data points.
    """
    from scipy.spatial.distance import cdist
    D = cdist(xRef, xSearch)
    Dsum = D.sum(axis=0)
    idxmax = np.argmax(Dsum)
    return xSearch[idxmax,:], idxmax




[docs]
def ecdf(X, x):
    """Emperical Cumulative Distribution Function
    
    X: 
        1-D array. Vector of data points per each feature (dimension), defining
        the distribution of data along that specific dimension.
        
    x:
        Value. It is the value of the corresponding dimension of an archetype.
        
    P(X <= x):
        The cumulative distribution of data points with respect to the archetype
        (the probablity or how much of data in a specific dimension is covered
        by the archetype).
    """
    return float(len(X[X < x]) / len(X))




[docs]
def calcSSE(Xact, Xapx):
    """
    This function returns the Sum of Square Errors.
    """
    return ((Xact - Xapx) ** 2).sum()




[docs]
def calcSST(Xact):
    """
    This function returns the Sum of Square Errors.
    """
    return (Xact ** 2).sum()




[docs]
def explainedVariance(Xact, Xapx, method='sklearn'):
    if method.lower() == 'sklearn':  
        return explained_variance_score(Xact.T, Xapx.T)
    else:
        SSE = calcSSE(Xact, Xapx)
        SST = calcSST(Xact)
        return (SST - SSE) / SST

        
        

[docs]
def solveConstrainedNNLS(u, t, C):
    """
    This function solves the typical equation of ||U - TW||^2 where U and T are
    defined and W should be determined such that the above expression is 
    minimised. Further, solution of W is subjected to the following constraints:
        
        Constraint 1:       W >= 0
        Constraint 2:       sum(W) = 1
    
    Note that the above equation is a typical equation in solving alfa's and
    beta's.
    
    
    Solving for ALFA's:
    -------------------
    when solving for alfa's the following equation should be minimised:
        
        ||Xi - sum([alfa]ik x Zk)|| ^ 2.
    
    This equation should be minimised for each data point (i.e. nData is the
    number of equations), which results in nData rows of alfa's. In each 
    equation U, T, and W have the following dimensions:
        
    Equation (i):
        
        U (Xi):         It is a 1D-array of nDim x 1 dimension.
        T (Z):          It is a 2D-array of nDim x k dimension.
        W (alfa):       It is a 1D-array of k x 1 dimension.
    
    
    Solving for BETA's:
    -------------------   
    """
    mObservation, nVariables = t.shape
    # EMPHASIZING THE CONSTRAINT
    u = u * C
    t = t * C   
    # ADDING THE CONSTRAINT EQUATION
    u = np.append(u, [1], axis = 0)
    t = np.append(t, np.ones([1,nVariables]), axis = 0)
    w, rnorm = nnls(t, u)
    return w, rnorm




[docs]
def furthestSum(K, noc, i, exclude=[]):
    """
    Note by Benyamin Motevalli:
        
        This function was taken from the following address:
            https://github.com/ulfaslak/py_pcha
            
        and the original author is: Ulf Aslak Jensen.
    """
    
    """
    Original Note:
        
    Furthest sum algorithm, to efficiently generat initial seed/archetypes.

    Note: Commonly data is formatted to have shape (examples, dimensions).
    This function takes input and returns output of the transposed shape,
    (dimensions, examples).
    
    Parameters
    ----------
    K : numpy 2d-array
        Either a data matrix or a kernel matrix.

    noc : int
        Number of candidate archetypes to extract.

    i : int
        inital observation used for to generate the FurthestSum.

    exclude : numpy.1darray
        Entries in K that can not be used as candidates.

    Output
    ------
    i : int
        The extracted candidate archetypes
    """
    def maxIndVal(l):
        return max(zip(range(len(l)), l), key=lambda x: x[1])

    I, J = K.shape
    index = np.array(range(J))
    index[exclude] = 0
    index[i] = -1
    indT = i
    sumDist = np.zeros((1, J), np.complex128)

    if J > noc * I:
        Kt = K
        Kt2 = np.sum(Kt**2, axis=0)
        for k in range(1, noc + 11):
            if k > noc - 1:
                Kq = np.dot(Kt[:, i[0]], Kt)
                sumDist -= np.lib.scimath.sqrt(Kt2 - 2 * Kq + Kt2[i[0]])
                index[i[0]] = i[0]
                i = i[1:]
            t = np.where(index != -1)[0]
            Kq = np.dot(Kt[:, indT].T, Kt)
            sumDist += np.lib.scimath.sqrt(Kt2 - 2 * Kq + Kt2[indT])
            ind, val = maxIndVal(sumDist[:, t][0].real)
            indT = t[ind]
            i.append(indT)
            index[indT] = -1
    else:
        if I != J or np.sum(K - K.T) != 0:  # Generate kernel if K not one
            Kt = K
            K = np.dot(Kt.T, Kt)
            K = np.lib.scimath.sqrt(
                np.tile(np.diag(K), (J, 1)) - 2 * K + \
                np.tile(np.mat(np.diag(K)).T, (1, J))
            )

        Kt2 = np.diag(K)  # Horizontal
        for k in range(1, noc + 11):
            if k > noc - 1:
                sumDist -= np.lib.scimath.sqrt(Kt2 - 2 * K[i[0], :] + Kt2[i[0]])
                index[i[0]] = i[0]
                i = i[1:]
            t = np.where(index != -1)[0]
            sumDist += np.lib.scimath.sqrt(Kt2 - 2 * K[indT, :] + Kt2[indT])
            ind, val = maxIndVal(sumDist[:, t][0].real)
            indT = t[ind]
            i.append(indT)
            index[indT] = -1
    return i