From 279e64e81bd6203b01d940373774004530e4a300 Mon Sep 17 00:00:00 2001 From: Joakim <joakim.j.linja@jyu.fi> Date: Fri, 5 May 2023 20:27:42 +0300 Subject: [PATCH] Uploaded Feature Importance Detector (FID) implementations (Python + and Matlab) related to arXiv:2303.09453 --- fid.m | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fid.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 fid.m create mode 100644 fid.py diff --git a/fid.m b/fid.m new file mode 100644 index 0000000..cb4600d --- /dev/null +++ b/fid.m @@ -0,0 +1,74 @@ +function [rX, rY, f2] = fid(X, Y, cutoff, plotToggle) + % Feature Importance Detector, published in the paper: + % "Knowledge Discovery from Atomic Structures using Feature Importances" (arXiv:2303.09453). + % Description: + % Performs FID feature selection. + % + % Inputs: + % X - Index vector. + % Y - Feature score vector. Assumed to be in descending + % order. + % cutoff - The hyperparameter of FID. Range [0,1]. Determines the number + % of features to be removed. + % plotToggle - Optional boolean input, default is false. + % + % Ouputs: + % rX - Feature selected X + % rY - Feature selected Y + % f2 - Boolean vector, used to from rX and rY out of X and Y. + % + + if exist('plotToggle','var') + plotT = plotToggle; + else + plotT = false; + end + + fun = @(a,b,c,d,x) -a./(b*x+c)+d; + + x1 = X(1); y1 = Y(1); + Y1 = zeros(size(Y)); + + if plotT + figure, plot(X,Y,'LineWidth',2); + title('FID: Input') + end + + Y1(2:end) = (Y(2:end,1)-y1)./(X(2:end,1)-x1); + Y1(1) = Y1(2); + + if plotT + figure, plot(X,Y1,'LineWidth',2); + title('FID: Rolling slope') + end + + fopt = fitoptions('Method','NonLinearLeastSquares','Lower',[0 -Inf -Inf -Inf],'Upper',[Inf Inf Inf Inf],'StartPoint',[1 10 10 0]); + if plotT + disp(fopt) + end + + fitobj = fit(X,Y1,fun,fopt); + + Y2 = fitobj(X); + + if plotT + figure, plot(X,Y2,'LineWidth',2); + title('FID: Curve fit') + end + + dY2 = Y2(end)-Y2(1); + f2 = Y2 < Y2(end) - dY2*cutoff; + + rX = X(f2); + rY = Y(f2); + + if plotT + nf2 = ~f2; + figure, plot(X,Y,'LineWidth',1); + hold on; + nrX = X(nf2); + nrY = Y(nf2); + plot(nrX,nrY,'LineWidth',2); + title('FID: Result') + end +end \ No newline at end of file diff --git a/fid.py b/fid.py new file mode 100644 index 0000000..03ca8ae --- /dev/null +++ b/fid.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Feature Importance Detector presented in +Knowledge Discovery from Atomic Structures using Feature Importances (arXiv:2303.09453) + +@author: Joakim Linja +""" +import numpy as np +import scipy as sp + +def fid(X,Y,cutoff = 0.01): + """ + 1. Bound rolling slope + 2. Curve fit a/(b*x+c)+d + 3. Remove features based on cutoff-defined percentage of total variance + + Parameters + ---------- + + X: numpy.ndarray + Feature score index array in ascending numerical order. + + Y: numpy.ndarray + Feature score array. Assumed to be sorted to descending order. + + cutoff: float, optional, [0,1] + Cutoff value to be used in the feature selection. + + Default: 0.01 + + + Returns + ------- + + rX: numpy.matrix + Feature selected X. + + rY: numpy.matrix + Feature selected Y. + + f2: numpy.ndarray + Boolean indexing array, the feature selection result. + + """ + try: + def f(x,a,b,c,d): + return -a/(b*x+c)+d + + x0 = X[0]; y0 = Y[0] + Y1 = np.zeros(len(Y)) + for i in range(1,len(X)): + Y1[i] = (Y[i]-y0)/(X[i]-x0) + Y1[0] = Y1[1] + + + popt, pcov = sp.optimize.curve_fit(f,X,Y1,p0=[1,10,10,0], + bounds=([0,-np.inf,-np.inf,-np.inf],[np.inf,np.inf,np.inf,np.inf])) + Y2 = f(X,*popt) + + dY2 = Y2[-1]-Y2[0] + f2 = Y2 < Y2[-1] - dY2*cutoff + + rX = X[f2] + rY = Y[f2] + + return rX, rY, f2 + except RuntimeError as e: + raise e -- GitLab