import numpy as np
from scipy.optimize import nnls
from sklearn.metrics import explained_variance_score
[docs]
def findFurthestPoint(xSearch, xRef):
"""
This function finds a data point in x_search which has the furthest
distance from all data points in xRef. In the case of archetypes,
xRef is the archetypes and xSearch is the dataset.
Note:
In both xSearch and xRef, the columns of the arrays should be the
dimensions and the rows should be the data points.
"""
from scipy.spatial.distance import cdist
D = cdist(xRef, xSearch)
Dsum = D.sum(axis=0)
idxmax = np.argmax(Dsum)
return xSearch[idxmax,:], idxmax
[docs]
def ecdf(X, x):
"""Emperical Cumulative Distribution Function
X:
1-D array. Vector of data points per each feature (dimension), defining
the distribution of data along that specific dimension.
x:
Value. It is the value of the corresponding dimension of an archetype.
P(X <= x):
The cumulative distribution of data points with respect to the archetype
(the probablity or how much of data in a specific dimension is covered
by the archetype).
"""
return float(len(X[X < x]) / len(X))
[docs]
def calcSSE(Xact, Xapx):
"""
This function returns the Sum of Square Errors.
"""
return ((Xact - Xapx) ** 2).sum()
[docs]
def calcSST(Xact):
"""
This function returns the Sum of Square Errors.
"""
return (Xact ** 2).sum()
[docs]
def explainedVariance(Xact, Xapx, method='sklearn'):
if method.lower() == 'sklearn':
return explained_variance_score(Xact.T, Xapx.T)
else:
SSE = calcSSE(Xact, Xapx)
SST = calcSST(Xact)
return (SST - SSE) / SST
[docs]
def solveConstrainedNNLS(u, t, C):
"""
This function solves the typical equation of ||U - TW||^2 where U and T are
defined and W should be determined such that the above expression is
minimised. Further, solution of W is subjected to the following constraints:
Constraint 1: W >= 0
Constraint 2: sum(W) = 1
Note that the above equation is a typical equation in solving alfa's and
beta's.
Solving for ALFA's:
-------------------
when solving for alfa's the following equation should be minimised:
||Xi - sum([alfa]ik x Zk)|| ^ 2.
This equation should be minimised for each data point (i.e. nData is the
number of equations), which results in nData rows of alfa's. In each
equation U, T, and W have the following dimensions:
Equation (i):
U (Xi): It is a 1D-array of nDim x 1 dimension.
T (Z): It is a 2D-array of nDim x k dimension.
W (alfa): It is a 1D-array of k x 1 dimension.
Solving for BETA's:
-------------------
"""
mObservation, nVariables = t.shape
# EMPHASIZING THE CONSTRAINT
u = u * C
t = t * C
# ADDING THE CONSTRAINT EQUATION
u = np.append(u, [1], axis = 0)
t = np.append(t, np.ones([1,nVariables]), axis = 0)
w, rnorm = nnls(t, u)
return w, rnorm
[docs]
def furthestSum(K, noc, i, exclude=[]):
"""
Note by Benyamin Motevalli:
This function was taken from the following address:
https://github.com/ulfaslak/py_pcha
and the original author is: Ulf Aslak Jensen.
"""
"""
Original Note:
Furthest sum algorithm, to efficiently generat initial seed/archetypes.
Note: Commonly data is formatted to have shape (examples, dimensions).
This function takes input and returns output of the transposed shape,
(dimensions, examples).
Parameters
----------
K : numpy 2d-array
Either a data matrix or a kernel matrix.
noc : int
Number of candidate archetypes to extract.
i : int
inital observation used for to generate the FurthestSum.
exclude : numpy.1darray
Entries in K that can not be used as candidates.
Output
------
i : int
The extracted candidate archetypes
"""
def maxIndVal(l):
return max(zip(range(len(l)), l), key=lambda x: x[1])
I, J = K.shape
index = np.array(range(J))
index[exclude] = 0
index[i] = -1
indT = i
sumDist = np.zeros((1, J), np.complex128)
if J > noc * I:
Kt = K
Kt2 = np.sum(Kt**2, axis=0)
for k in range(1, noc + 11):
if k > noc - 1:
Kq = np.dot(Kt[:, i[0]], Kt)
sumDist -= np.lib.scimath.sqrt(Kt2 - 2 * Kq + Kt2[i[0]])
index[i[0]] = i[0]
i = i[1:]
t = np.where(index != -1)[0]
Kq = np.dot(Kt[:, indT].T, Kt)
sumDist += np.lib.scimath.sqrt(Kt2 - 2 * Kq + Kt2[indT])
ind, val = maxIndVal(sumDist[:, t][0].real)
indT = t[ind]
i.append(indT)
index[indT] = -1
else:
if I != J or np.sum(K - K.T) != 0: # Generate kernel if K not one
Kt = K
K = np.dot(Kt.T, Kt)
K = np.lib.scimath.sqrt(
np.tile(np.diag(K), (J, 1)) - 2 * K + \
np.tile(np.mat(np.diag(K)).T, (1, J))
)
Kt2 = np.diag(K) # Horizontal
for k in range(1, noc + 11):
if k > noc - 1:
sumDist -= np.lib.scimath.sqrt(Kt2 - 2 * K[i[0], :] + Kt2[i[0]])
index[i[0]] = i[0]
i = i[1:]
t = np.where(index != -1)[0]
sumDist += np.lib.scimath.sqrt(Kt2 - 2 * K[indT, :] + Kt2[indT])
ind, val = maxIndVal(sumDist[:, t][0].real)
indT = t[ind]
i.append(indT)
index[indT] = -1
return i