Source code for sicor.Tools.cB.classical_bayesian

import h5py
import json
from random import choice, sample
from copy import copy
from time import time
import logging
from tempfile import TemporaryFile
from psutil import virtual_memory
from tqdm import tqdm
from operator import itemgetter
import numpy as np
from numba import jit


def __zeros__(shape, dtype, max_mem_frac=0.3, logger=None):
    logger = logger or logging.getLogger(__name__)

    # noinspection PyShadowingNames
    def in_memory_array(shape, dtype):
        return np.zeros(shape, dtype)

    # noinspection PyShadowingNames
    def out_memory_array(shape, dtype):
        logger.warning("Not enough memory to keep full image -> fall back to memory map.")
        dat = np.memmap(filename=TemporaryFile(mode="w+b"), dtype=dtype, shape=tuple(shape))
        dat[:] = 0.0
        return dat

    to_gb = 1.0 / 1024.0 ** 3
    mem = virtual_memory().total * to_gb
    # array size in GB; keep as float (an int cast would truncate small arrays to 0)
    arr = np.prod(np.array(shape, dtype=np.int64)) * np.zeros(1, dtype=dtype).nbytes * to_gb

    if arr < max_mem_frac * mem:
        try:
            return in_memory_array(shape, dtype)
        except MemoryError:
            return out_memory_array(shape, dtype)
    else:
        logger.info(
            "Requested array of size %.2f GB on a machine with %.2f GB memory -> falling back to memory map." % (
                arr, mem))
        return out_memory_array(shape, dtype)
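
# Example (editor's sketch, not part of the original module): __zeros__ behaves like
# np.zeros, but transparently falls back to a temporary np.memmap for very large arrays:
#
# >>> arr = __zeros__(shape=(100, 10), dtype=np.float32)
# >>> arr.shape, float(arr.max())
# -> (100, 10) and 0.0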


def get_clf_functions():
    """
    Example of how classification functions can be defined; the returned dictionary
    is passed as the ``clf_functions`` argument to the ToClassifier classes.
    """
    return {
        "ratio": lambda d1, d2: save_divide(d1, d2),
        "index": lambda d1, d2: save_divide(d1 - d2, d1 + d2),
        "difference": lambda d1, d2: to_clf(d1) - to_clf(d2),
        "channel": lambda d: to_clf(d),
        "depth": lambda d1, d2, d3: save_divide(to_clf(d1) + to_clf(d2), d3),
        "index_free_diff": lambda d1, d2, d3, d4: save_divide(to_clf(d1) - to_clf(d2), to_clf(d3) - to_clf(d4)),
        "index_free_add": lambda d1, d2, d3, d4: save_divide(to_clf(d1) + to_clf(d2), to_clf(d3) + to_clf(d4)),
    }
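
# Example (editor's sketch, not part of the original module): each entry maps a feature
# name to a small function of one or more channels, e.g. the normalized difference "index":
#
# >>> funcs = get_clf_functions()
# >>> funcs["index"](np.array([2.0, 1.0]), np.array([1.0, 1.0]))  # (d1 - d2) / (d1 + d2)
# -> approximately array([0.333, 0.0])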
def save_divide(d1, d2, mx=100.0):
    """
    Safe division without introducing NaNs.

    :param d1: numerator
    :param d2: denominator
    :param mx: absolute maximum allowed value; the result is clipped to [-mx, mx]
    :return: d1 / d2
    """
    dd1 = to_clf(d1)
    dd2 = to_clf(d2)
    dd2[dd2 == 0.0] = 1e-6  # avoid division by zero
    dd1 /= dd2
    dd1 = np.nan_to_num(dd1)
    dd1[dd1 > mx] = mx
    dd1[dd1 < -mx] = -mx
    return dd1
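
# Example (editor's sketch, not part of the original module): zero denominators are
# replaced by 1e-6 and the result is clipped to [-mx, mx]:
#
# >>> save_divide(np.array([1.0, 2.0]), np.array([0.0, 4.0]))
# -> array with values [100.0, 0.5] (the division by zero hits the mx=100 clip)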
def to_clf(inp):
    """
    Helper function which sets the type of the features and assures numeric values.

    :param inp: input array or array-like
    :return: float array without NaNs or Infs
    """
    return np.nan_to_num(np.array(inp, dtype=float))
class _ToClassifierBase(object):
    def __init__(self, logger=None):
        """
        Internal base class for the generation of classifiers; only provides the common
        __call__ method. This dummy __init__ sets all basic needed attributes to None;
        derived classes need to implement a proper __init__.

        :return:
        """
        self.logger = logger or logging.getLogger(__name__)
        self.n_classifiers = None
        self.classifiers_fk = None
        self.classifiers_id = None
        self.clf_functions = None
        self.classifiers_id_full = None
    def adjust_classifier_ids(self, full_bands, band_lists):
        self.classifiers_id = [np.array([band_lists.index(full_bands[ii]) for ii in clf], dtype=int)
                               for clf in self.classifiers_id_full]
        self.logger.info(
            """Adjusting classifier channel list indices to actual image, convert from: %s to \n %s. \n
            This results in a changed classifier index array from:""" % (str(full_bands), str(band_lists)))
        for func, old, new in zip(self.classifiers_fk, self.classifiers_id_full, self.classifiers_id):
            self.logger.info("%s : %s -> %s" % (func, old, new))
    @staticmethod
    def list_np(arr):
        """
        Fixes a numpy annoyance where scalar arrays and vector arrays are treated
        differently, namely that one cannot iterate over a scalar array. This function
        returns a python list for both scalar and vector arrays.

        :param arr: numpy array or numpy scalar
        :return: list with the values of arr
        """
        try:
            return list(arr)
        except TypeError:  # will fail if arr is a numpy scalar array
            return list(arr.reshape(1))
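
    # Example (editor's sketch, not part of the original module): list_np returns a list
    # for both 0-d and 1-d arrays, which plain list() cannot do for the 0-d case:
    #
    # >>> _ToClassifierBase.list_np(np.array(3))
    # -> a one-element list, [3]
    # >>> _ToClassifierBase.list_np(np.array([1, 2]))
    # -> [1, 2]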
    def __call__(self, data):
        """
        Secret sauce of the Classical Bayesian approach in python: the input
        data -> ([n_samples, n_data_channels]) is transformed into
        ret -> ([n_samples, n_classifiers]).

        Iteration is performed over classifiers_fk (name found in clf_functions) and
        classifiers_id (channel selection from data for this function).

        :param data: n_samples x n_data_channels
        :return: ret: n_samples x n_classifiers
        """
        # ret = np.zeros((data.shape[0], self.n_classifiers))  # initialize result
        ret = __zeros__(shape=(data.shape[0], self.n_classifiers), dtype=np.float32)  # initialize result
        for ii, (fn, idx_clf) in enumerate(zip(self.classifiers_fk, self.classifiers_id)):
            # note that the input of clf_functions[fn] is a generator expression where
            # iteration is performed over the selected classifiers_id's
            ret[:, ii] = self.clf_functions[fn](*(data[:, idx] for idx in self.list_np(idx_clf)))
        return ret
# noinspection PyMissingConstructor
class ToClassifierDef(_ToClassifierBase):
    def __init__(self, classifiers_id, classifiers_fk, clf_functions, id2name=None, logger=None):
        """
        Simplest case of a usable ToClassifier instance; everything is fixed.

        :param classifiers_id: list of lists / np.arrays with indices which are inputs for the classifier functions
        :param classifiers_fk: list of names of the functions to be used
        :param clf_functions: dictionary with key, value pairs of the function names used in classifiers_fk
        """
        self.logger = logger or logging.getLogger(__name__)
        self.n_classifiers = len(classifiers_fk)
        self.clf_functions = clf_functions
        self.classifiers_id_full = classifiers_id
        self.classifiers_id = classifiers_id
        self.classifiers_fk = classifiers_fk
        self.id2name = id2name
        # assert equal length
        assert len(self.classifiers_id) == self.n_classifiers
        assert len(self.classifiers_fk) == self.n_classifiers
        # assert that used functions are in self.clf_functions
        for cl_fk in self.classifiers_fk:
            assert cl_fk in self.clf_functions
        # assert that each value in the dict is a callable
        for name, func in self.clf_functions.items():
            if hasattr(func, "__call__") is False:
                raise ValueError("Each value in clf_functions should be a callable, error for: %s" % name)
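
# Example (editor's sketch, not part of the original module): build a feature extractor
# that computes one "index" from channels 0 and 1 and one raw "channel" from channel 2,
# and apply it to fake data with 4 channels:
#
# >>> mk_clf = ToClassifierDef(classifiers_id=[np.array([0, 1]), np.array([2])],
# ...                          classifiers_fk=["index", "channel"],
# ...                          clf_functions=get_clf_functions())
# >>> features = mk_clf(np.random.rand(1000, 4))
# >>> features.shape
# (1000, 2)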
def read_classical_bayesian_from_hdf5_file(filename):
    """
    Loads persistence data for a classical Bayesian classifier from an hdf5 file.

    :param filename: path to the hdf5 file
    :return: dictionary with the needed data
    """
    with h5py.File(filename, 'r') as h5f:
        kwargs_mk_clf = {name: json.loads(h5f[name][()]) for name in ["classifiers_fk", "classifiers_id"]}
        kwargs_mk_clf["id2name"] = json.loads(h5f["band_names"][()])

        kwargs_cb = {"bns": [h5f[name][()] for name in json.loads(h5f["bns_names"][()])],
                     "hh_full": h5f["hh_full"][()],
                     "hh": {h5f["%s_key" % name][()]: h5f["%s_value" % name][()]
                            for name in json.loads(h5f["hh_names"][()])},
                     "hh_n": {key: value for key, value in zip(h5f["hh_n_keys"][()], h5f["hh_n_values"][()])},
                     "n_bins": json.loads(h5f["n_bins"][()]),
                     "classes": h5f["classes"][()],
                     "n_classes": json.loads(h5f["n_classes"][()]),
                     "bb_full": [h5f[name][()] for name in json.loads(h5f["bb_full_names"][()])]}
        try:
            mask_legend = {int(key): value for key, value in json.loads(h5f["mask_legend"][()]).items()}
            mask_legend.update({value: key for key, value in mask_legend.items()})
        except KeyError:
            mask_legend = None
        try:
            clf_to_col = {int(key): value for key, value in json.loads(h5f["clf_to_col"][()]).items()}
        except KeyError:
            clf_to_col = None

        return {"kwargs_cb": kwargs_cb,
                "kwargs_mk_clf": kwargs_mk_clf,
                "mask_legend": mask_legend,
                "clf_to_col": clf_to_col
                }
def dict_conv(inp):
    """Convert a dictionary into two numpy arrays holding its keys and values."""
    keys, values = [], []
    for key, value in inp.items():
        keys.append(key)
        values.append(value)
    keys = np.array(keys)
    values = np.array(values)
    return keys, values
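
# Example (editor's sketch, not part of the original module): dict_conv turns a dict into
# two parallel arrays, which is how hh_n is stored in the hdf5 persistence file:
#
# >>> keys, values = dict_conv({10: 500, 20: 300})
# >>> keys, values
# -> array([10, 20]) and array([500, 300])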
def write_classical_bayesian_to_hdf5_file(clf, filename, class_names, mask_legend, clf_to_col, band_names):
    """Write the persistence data of a classical Bayesian classifier to an hdf5 file."""
    with h5py.File(filename, 'w') as h5f:
        for name, data in [("class_names", class_names),
                           ("mask_legend", mask_legend),
                           ("classifiers_fk", clf.mk_clf.classifiers_fk),
                           ("classifiers_id", clf.mk_clf.classifiers_id),
                           ("n_bins", clf.n_bins),
                           ("n_classes", clf.n_classes),
                           ("clf_to_col", clf_to_col),
                           ("band_names", band_names),
                           ]:
            h5f.create_dataset(name, data=json.dumps(data))

        h5f.create_dataset("classes", data=clf.classes)

        keys, values = dict_conv(clf.hh_n)
        h5f.create_dataset("hh_n_keys", data=keys)
        h5f.create_dataset("hh_n_values", data=values)

        h5f.create_dataset("hh_full", data=clf.hh_full, compression="lzf")

        bns_names = []
        for ii, bn in enumerate(clf.bns):
            name = "bns_%i" % ii
            bns_names.append(name)
            h5f.create_dataset(name, data=bn)
        h5f.create_dataset("bns_names", data=json.dumps(bns_names))

        bb_full_names = []
        for ii, bn in enumerate(clf.bb_full):
            name = "bb_full_%i" % ii
            bb_full_names.append(name)
            h5f.create_dataset(name, data=bn)
        h5f.create_dataset("bb_full_names", data=json.dumps(bb_full_names))

        names = []
        for key, value in clf.hh.items():
            name = "hh_%i" % key
            names.append(name)
            h5f.create_dataset("%s_key" % name, data=key)
            h5f.create_dataset("%s_value" % name, data=value, compression="lzf")
        h5f.create_dataset("hh_names", data=json.dumps(names))
def __test__(clf, xx, yy, sample_weight=None, norm="right_class"): """ Test quality of classifier clf using xx as data and yy as truth values. :param clf: Classifier, needs to implement a predict method :param xx: data, to be classified -> (n_samples,n_data) :param yy: truth data: -> (n_samples,) :param sample_weight: -> weights -> (n_samples) :param norm: string which defined which norm to use, implemented are: "right_class" : only correct classification causes no penalty, norm value is the fraction of correctly classified data "class_distance" : penalty is increasing with increasing class distance, norm value is the normalized mean class distance :return: if sample weight is None: value between 0 and 1, one is best """ from scipy.stats import pearsonr # import here to avoid static TLS ImportError if norm == "right_class": if sample_weight is None: return np.sum(clf.predict(xx) == yy) / float(len(yy)) # ratio of correct else: yy = np.array(clf.predict(xx) == yy, dtype=int) yy[yy == 1] = 1 yy[yy == 0] = -1 return np.sum(yy * sample_weight) elif norm == "class_distance": if sample_weight is None: return 1.0 - np.sum(np.abs(np.array(clf.predict(xx) - yy, dtype=float))) / float( len(yy) * clf.n_classes) else: return 1.0 - np.sum(sample_weight * np.abs(np.array(clf.predict(xx) - yy, dtype=float))) / float( len(yy) * clf.n_classes) elif norm == "scatter": if sample_weight is None: slope, offset, _ = np.polyfit(x=yy, y=clf.predict(xx), deg=1) else: slope, offset, _ = np.polyfit(x=yy, y=clf.predict(xx), deg=1, w=sample_weight) tst = 1.0 - np.abs(1.0 - slope) return tst elif norm == "pearson": if sample_weight is None: tst = pearsonr(x=yy, y=clf.predict(xx))[0] return tst else: raise ValueError("Not implemented") elif norm == "histogram": if sample_weight is None: hh, _, _ = np.histogram2d(clf.predict(xx), yy, bins=clf.n_classes_, normed=True) # tst = np.mean(np.diagonal(hh)/(0.5*(np.sum(hh,axis=0)+np.sum(hh,axis=1)))) tst = 1.0 - np.sum( (1.0 - np.diagonal(hh) / (0.5 * (np.sum(hh, axis=0) + np.sum(hh, axis=1)))) ** 2) / clf.n_classes_ return tst else: raise ValueError("Not implemented") else: raise ValueError("Norm %s not implemented." % norm)
def histogramdd_fast(data, bins):
    """Fast replacement of np.histogramdd based on the numba-accelerated digitize below;
    assumes that all entries of bins have the same length."""
    n_smpl = data.shape[0]
    n_clf = len(bins)
    n_bins = len(bins[0])
    assigns = np.zeros(n_smpl, dtype=int)
    for ii, bb in enumerate(bins):
        assigns += digitize(data[:, ii], bb[1:-1]) * n_bins ** ii
    bc = np.bincount(assigns, minlength=n_bins ** n_clf).reshape(n_clf * [n_bins]).T
    return np.array(bc, dtype=float) / n_smpl
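
# Example (editor's sketch, not part of the original module): a 2-feature histogram on a
# common set of 11 bin edges; the result is normalized so that it sums to one:
#
# >>> data = np.random.rand(1000, 2)
# >>> edges = [np.linspace(0.0, 1.0, 11), np.linspace(0.0, 1.0, 11)]
# >>> hist = histogramdd_fast(data, edges)
# >>> hist.shape, hist.sum()
# -> (11, 11) and ~1.0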
class ClassicalBayesian(object):
    def __init__(self, mk_clf, bns, hh_full, hh, hh_n, n_bins, classes, n_classes, bb_full, logger=None):
        """
        :param mk_clf: callable which computes the classification features from the input data (e.g. ToClassifierDef)
        :param bns: list of bin edges for each classification feature
        :param hh_full: normalized histogram over all classes
        :param hh: dictionary with a normalized histogram per class
        :param hh_n: dictionary with the number of training samples per class
        :param n_bins: number of histogram bins
        :param classes: array with the class labels
        :param n_classes: number of classes
        :param bb_full: list of bin edges belonging to hh_full
        :return:
        """
        self.logger = logger or logging.getLogger(__name__)
        self.mk_clf = mk_clf
        self.bns = bns
        self.hh_full = hh_full
        self.hh = hh
        self.hh_n = hh_n
        self.n_bins = n_bins
        self.classes = classes
        self.n_classes = n_classes
        self.bb_full = bb_full
        self.zm = 0.5
        self.gs = 0.5
        self.ar_hh_full = {}
        self.ar_hh = {cl: {} for cl in self.classes}

    def __in_bounds__(self, ids):
        ids[ids > self.n_bins - 1] = self.n_bins - 1

    def __predict__(self, xx):
        # import here to avoid static TLS ImportError
        from scipy.ndimage import zoom
        from scipy.ndimage import gaussian_filter

        ids = [digitize(ff, bb) - 1 for ff, bb in zip(self.mk_clf(xx).transpose(), self.bb_full)]
        tt = 0
        for ii in ids:
            self.__in_bounds__(ii)
        pp = np.zeros((self.n_classes, len(ids[0])), dtype=float)
        for ii, cl in enumerate(self.classes):
            hh = self.hh[cl][tuple(ids)]
            hh_full = self.hh_full[tuple(ids)]
            hh_valid = hh_full > 0.0
            pp[ii, hh_valid] = hh[hh_valid] / hh_full[hh_valid] / self.n_classes
            hh_invalid = np.logical_not(hh_valid)
            t0 = time()
            if np.sum(hh_invalid) > 0:
                ar_hh_full, ar_hh = None, None
                iw = -1
                while np.sum(hh_invalid) > 0:
                    iw += 1
                    try:
                        ar_hh_full = self.ar_hh_full[iw]
                    except KeyError:
                        if ar_hh_full is None and iw == 0:
                            ar_hh_full = np.copy(self.hh_full)
                        self.ar_hh_full[iw] = gaussian_filter(zoom(ar_hh_full, order=1, zoom=self.zm),
                                                              sigma=self.gs)
                        ar_hh_full = self.ar_hh_full[iw]
                    try:
                        ar_hh = self.ar_hh[cl][iw]
                    except KeyError:
                        if ar_hh is None and iw == 0:
                            ar_hh = np.copy(self.hh[cl])
                        self.ar_hh[cl][iw] = gaussian_filter(zoom(ar_hh, order=1, zoom=self.zm), self.gs)
                        ar_hh = self.ar_hh[cl][iw]

                    n_bins = ar_hh_full.shape[0]
                    ids_bf = [np.array(id_bf[hh_invalid] * (n_bins / self.n_bins), dtype=int) for id_bf in ids]
                    for ii_bf in ids_bf:
                        ii_bf[ii_bf > n_bins - 1] = n_bins
                    hh_full = ar_hh_full[tuple(ids_bf)]
                    hh = ar_hh[tuple(ids_bf)]
                    good = hh_full != 0.0
                    hh_ok = np.copy(hh_invalid)
                    # need element wise comparison here -> use as index field
                    hh_ok[hh_ok == np.True_] = good
                    # need element wise comparison here -> use as index field
                    hh_invalid[hh_invalid == np.True_] = np.logical_not(good)
                    pp[ii, hh_ok] = hh[good] / hh_full[good] / self.n_classes
                    self.logger.info("class: %s, bins: %i->%i, curr ok: %i, still bad: %i" %
                                     (str(cl), self.n_bins, n_bins, int(np.sum(hh_ok)), int(np.sum(hh_invalid))))
            tt += time() - t0
        self.logger.info("Time spent in reduced form: %.2f" % tt)
        return pp
    def predict_proba(self, xx):
        pr = self.__predict__(xx.reshape((-1, xx.shape[-1]))).transpose()
        return pr.reshape(list(xx.shape[:-1]) + [pr.shape[-1], ])

    def predict(self, xx):
        pr = self.classes[np.argmax(self.__predict__(xx.reshape((-1, xx.shape[-1]))), axis=0)]
        return pr.reshape(xx.shape[:-1])

    def conf(self, xx):
        proba = self.predict_proba(xx)
        conf = np.nan_to_num(np.max(proba, axis=1) / np.sum(proba, axis=1))
        return conf.reshape(xx.shape[:-1])

    def predict_and_conf(self, xx, bad_data_value=255):
        proba = self.__predict__(xx.reshape((-1, xx.shape[-1]))).transpose()
        pr = self.classes[np.argmax(proba, axis=1)]
        tot = np.sum(proba, axis=1)
        with np.errstate(invalid='ignore'):  # tot might contain zeros -> fix later, is faster that way
            conf = np.max(proba, axis=1) / tot
        conf[tot == 0.0] = 0.0
        pr[conf == 0.0] = bad_data_value
        return pr.reshape(xx.shape[:-1]), conf.reshape(xx.shape[:-1])
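
# Example (editor's sketch, not part of the original module): typical inference flow when
# a persistence file written by write_classical_bayesian_to_hdf5_file is available
# ("classifier.h5" and `image` are hypothetical placeholders):
#
# >>> pers = read_classical_bayesian_from_hdf5_file("classifier.h5")
# >>> mk_clf = ToClassifierDef(clf_functions=get_clf_functions(), **pers["kwargs_mk_clf"])
# >>> cb = ClassicalBayesian(mk_clf=mk_clf, **pers["kwargs_cb"])
# >>> mask, confidence = cb.predict_and_conf(image)  # image: (ny, nx, n_channels) array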
# noinspection PyMissingConstructor
class ClassicalBayesianFit(ClassicalBayesian):
    def __init__(self, mk_clf, smooth_min=0.0, smooth_max=2.0, n_bins_min=5, n_bins_max=20, n_runs=10000,
                 smooth_dd=10, max_mb=100.0, norm="right_class", ff=None, xx=None, yy=None, max_run_time=60,
                 sample_weight=None, use_tqdm=False, sufficient_norm=None, dtype=float, fit_method="random",
                 smooth=None, logger=None):
        """
        :param ff: function of two variables which combines ff_train and ff_test into the value that is
                   maximised during the fit; the default is:
                   lambda ff_train, ff_test: 0.5 * (ff_train + ff_test) - 0.4 * np.abs(ff_train - ff_test)
                   which contains a penalty term against overfitting
        :param max_mb: maximum size of a histogram array in MB; if the number of bins or features would lead
                       to a larger array, the number of bins is reduced to satisfy max_mb
        """
        self.logger = logger or logging.getLogger(__name__)
        self.fit_method = fit_method
        self.dtype = dtype
        assert hasattr(mk_clf, "__call__")

        if use_tqdm is False:  # if tqdm shall not be used, use a dummy function
            self.tqdm = lambda x: x
        else:
            self.tqdm = tqdm

        if ff is None:
            self.ff = lambda ff_train, ff_test: 0.5 * (ff_train + ff_test) - 0.4 * np.abs(ff_train - ff_test)
        else:
            assert hasattr(ff, "__call__")  # ff should be a callable
            self.ff = ff

        self.norm = norm
        self.idx = None
        self.mk_clf = mk_clf
        self.params = {"mk_clf": self.mk_clf}
        self.sample_weight = None
        self.n_bins = None
        self.classes = None
        self.n_classes = None
        self.classes_ = None
        self.n_classes_ = None
        self.zm = 0.5
        self.gs = 0.5
        self.ar_hh_full = {}

        if hasattr(mk_clf, "__adjust__") is True:
            self.mk_clf = mk_clf

        self.n_bins_min = n_bins_min
        self.n_bins_max = np.min([
            n_bins_max,
            int(np.floor((max_mb / (np.zeros(1, dtype=self.dtype).nbytes / 1024. ** 2)) ** (
                1.0 / self.mk_clf.n_classifiers)))])
        assert self.n_bins_max >= self.n_bins_min
        self.n_runs = n_runs
        self.max_run_time = max_run_time
        self.smooth = None
        self.n_bins = None
        self.smooth_values = list(np.around(
            np.linspace(smooth_min, smooth_max, int(smooth_dd)),
            decimals=int(np.ceil(-1 * np.log10((smooth_max - smooth_min) / smooth_dd)))))
        self.params.update({"smooth_min": smooth_min,
                            "smooth_max": smooth_max,
                            "smooth_dd": smooth_dd,
                            "n_bins_min": self.n_bins_min,
                            "n_bins_max": self.n_bins_max,
                            "n_runs": self.n_runs,
                            "norm": self.norm,
                            })
        self.ff_train = None
        self.ff_test = None
        self.ff_res = None

        if xx is not None and yy is not None:
            if type(smooth) is dict:
                n_bins = int(self.n_bins_min + np.random.sample() * (self.n_bins_max - self.n_bins_min))
                self.set(xx=xx, yy=yy, sample_weight=sample_weight, n_bins=n_bins, smooth=smooth)
            else:
                self.fit(xx=xx, yy=yy, sample_weight=sample_weight, sufficient_norm=sufficient_norm)
    def get_params(self, deep=True):
        # for scikit-learn: return the parameters
        return self.params

    def set_params(self, **params):
        # for scikit-learn: set up a new classifier
        if "mk_clf" in params:
            ss = copy(self)
            ss.__init__(**params)
            return ss
        else:
            for name, value in params.items():
                setattr(self, name, value)
            return self
    def mk_hist(self, xx):
        from scipy.ndimage import gaussian_filter  # import here to avoid static TLS ImportError
        hh = histogramdd_fast(xx, self.bns)
        if self.smooth is not None:
            hh = gaussian_filter(hh, self.smooth)
        return np.array(hh, dtype=self.dtype), self.bns
    @staticmethod
    def bins4histeq(inn, nbins_ou=10, nbins_in=1000):
        """
        Returns the bin edges for an equalized histogram; assumes numpy arrays.
        """
        hist, bins = np.histogram(inn.flatten(), nbins_in, density=True)
        cdf = hist.cumsum()
        cdf = cdf / cdf[-1]
        xxx = np.linspace(0., 1., nbins_ou + 1)
        return np.interp(xxx, np.hstack((np.array([0]), cdf)), bins)
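
    # Example (editor's sketch, not part of the original module): for roughly uniform data
    # the equalized bin edges come out roughly equidistant, while skewed data gets denser
    # edges where most of the samples lie:
    #
    # >>> edges = ClassicalBayesianFit.bins4histeq(np.random.rand(10000), nbins_ou=5)
    # >>> edges
    # -> 6 edges, approximately [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]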
    def set(self, xx, yy, smooth=None, n_bins=5, sample_weight=None):
        from sklearn.model_selection import train_test_split  # import here to avoid static TLS ImportError
        self.sample_weight = sample_weight
        self.n_bins = n_bins
        self.classes = np.unique(yy)
        self.ar_hh = {cl: {} for cl in self.classes}
        self.n_classes = len(self.classes)
        self.classes_ = self.classes
        self.n_classes_ = self.n_classes

        if type(smooth) is dict:
            xx_train, xx_test, yy_train, yy_test = train_test_split(xx, yy, test_size=smooth["test_size"],
                                                                    random_state=42)
            left = smooth["min"]
            right = smooth["max"]

            self.__set__(xx_train, yy_train, smooth=left, sample_weight=sample_weight)
            tst_left = __test__(self, xx_test, yy_test, sample_weight=sample_weight, norm=self.norm)
            self.__set__(xx_train, yy_train, smooth=right, sample_weight=sample_weight)
            tst_right = __test__(self, xx_test, yy_test, sample_weight=sample_weight, norm=self.norm)

            for i_steps in range(smooth["steps"]):
                middle = 0.5 * (left + right)
                self.__set__(xx_train, yy_train, smooth=middle, sample_weight=sample_weight)
                tst_middle = __test__(self, xx_test, yy_test, sample_weight=sample_weight, norm=self.norm)
                if (tst_middle + tst_left) > (tst_middle + tst_right):
                    right = copy(middle)
                    tst_right = copy(tst_middle)
                else:
                    left = copy(middle)
                    tst_left = copy(tst_middle)

            if tst_left > tst_middle:
                self.__set__(xx_train, yy_train, smooth=left, sample_weight=sample_weight)
            if tst_right > tst_middle:
                self.__set__(xx_train, yy_train, smooth=right, sample_weight=sample_weight)
        else:  # scalar or None
            tst = self.__set__(xx, yy, smooth=smooth, sample_weight=sample_weight)
            return tst
    def __set__(self, xx, yy, smooth=None, sample_weight=None, test_sample_size=0.05):
        self.smooth = smooth
        xx_clf = self.mk_clf(xx)
        self.bns = [self.bins4histeq(xx, self.n_bins, nbins_in=1000) for xx in xx_clf.transpose()]
        self.hh_full, self.bb_full = self.mk_hist(xx_clf)
        self.hh = {}
        self.hh_n = {}
        for cl in self.classes:
            self.hh[cl], _ = self.mk_hist(xx_clf[yy == cl, :])
            self.hh_n[cl] = np.sum(yy == cl)
        if test_sample_size is None:
            tst = __test__(self, xx, yy, sample_weight=sample_weight, norm=self.norm)
        else:
            if self.idx is None:
                self.idx = np.random.choice(np.arange(xx.shape[0]),
                                            int(np.ceil(test_sample_size * xx.shape[0])),
                                            replace=False)
            if sample_weight is None:
                sw = None
            else:
                sw = sample_weight[self.idx]
            tst = __test__(self, xx[self.idx, :], yy[self.idx], sample_weight=sw, norm=self.norm)
        return tst
    def fit(self, xx, yy, sample_weight=None, **kwargs):
        # keep xx, yy, sample_weight explicit to comply with sklearn
        if self.fit_method == "random":
            self.fit_random(xx, yy, sample_weight=sample_weight, **kwargs)
        elif self.fit_method == "random_bootstrap":
            self.fit_random_bootstrap(xx, yy, sample_weight=sample_weight, **kwargs)
        elif self.fit_method == "chosen_one":
            self.fit_chosen_one(xx, yy, sample_weight=sample_weight, **kwargs)
        else:
            raise ValueError("Method %s not supported." % self.fit_method)
    def fit_chosen_one(self, xx, yy, sample_weight=None, test_size=None, sufficient_norm=None):
        fraction = choice([0.25, 0.5, 0.75])
        n_classifiers_final = self.mk_clf.n_classifiers
        max_run_time = self.max_run_time

        self.n_runs = int(fraction * self.n_runs)
        self.max_run_time = fraction * max_run_time
        self.mk_clf.n_classifiers = 1
        self.mk_clf.n_constant_classifiers = 0
        opts_full = self.fit_random(xx, yy, sample_weight=sample_weight, test_size=test_size,
                                    sufficient_norm=sufficient_norm, return_opts=True)
        opts = []
        for opt in opts_full:
            if opt["ff"] not in [oo["ff"] for oo in opts]:
                opts.append(opt)
        nn = np.max([int(len(opts) / 10), 1])
        choose_from = {"classifiers_id": [oo["classifiers_id"][0] for oo in opts[:nn]],
                       "classifiers_fk": [oo["classifiers_fk"][0] for oo in opts[:nn]],
                       "ff": [oo["ff"] for oo in opts[:nn]]}
        self.mk_clf.choose_from = choose_from
        print("Sample", nn, "Fraction:", fraction)
        print(self.mk_clf.choose_from)

        self.n_runs = int((1 - fraction) * self.n_runs)
        self.max_run_time = (1 - fraction) * max_run_time
        self.mk_clf.n_classifiers = n_classifiers_final
        self.fit_random(xx, yy, sample_weight=sample_weight, test_size=test_size,
                        sufficient_norm=sufficient_norm, return_opts=True)
        self.max_run_time = max_run_time
    def fit_random_bootstrap(self, xx, yy, sample_weight=None, test_size=None, sufficient_norm=None):
        n_classifiers_final = self.mk_clf.n_classifiers
        n_runs_final = self.n_runs
        max_run_time = self.max_run_time

        self.n_runs = int(n_runs_final / n_classifiers_final) + 1
        self.max_run_time = max_run_time / n_classifiers_final

        for ii in range(1, n_classifiers_final + 1):
            if ii == 1:
                self.mk_clf.constant_classifiers = {'classifiers_fk': [], 'classifiers_id': []}
            else:
                self.mk_clf.constant_classifiers = {'classifiers_fk': self.mk_clf.classifiers_fk,
                                                    'classifiers_id': self.mk_clf.classifiers_id}
            # print(self.mk_clf.constant_classifiers)
            self.mk_clf.n_classifiers = ii
            self.mk_clf.n_constant_classifiers = ii - 1
            self.mk_clf.__adjust__()
            self.fit_random(xx, yy, sample_weight=sample_weight, test_size=test_size,
                            sufficient_norm=sufficient_norm)
            # print(self.mk_clf.classifiers_id, self.mk_clf.classifiers_fk)

        self.mk_clf.n_classifiers = n_classifiers_final
        self.n_runs = n_runs_final
        self.max_run_time = max_run_time
    def fit_random(self, xx, yy, sample_weight=None, test_size=None, sufficient_norm=None, return_opts=False):
        from sklearn.model_selection import train_test_split  # import here to avoid static TLS ImportError
        if sample_weight is not None:
            xx_train, xx_test, yy_train, yy_test, sw_train, sw_test = train_test_split(xx, yy, sample_weight,
                                                                                       random_state=42)
        else:
            xx_train, xx_test, yy_train, yy_test = train_test_split(xx, yy, test_size=test_size, random_state=42)
            sw_train, sw_test = None, None
        del sample_weight

        opts = []
        ofx = []
        ofy = []
        t0 = time()
        for ii in self.tqdm(range(self.n_runs)):
            smooth = sample(self.smooth_values, 1)[0]
            n_bins = int(self.n_bins_min + np.random.sample() * (self.n_bins_max - self.n_bins_min))
            self.mk_clf.__adjust__()
            ff_train = self.set(xx_train, yy_train, smooth=smooth, n_bins=n_bins, sample_weight=sw_train)
            ff_test = __test__(self, xx_test, yy_test, sample_weight=sw_test, norm=self.norm)
            ff = self.ff(ff_train, ff_test)
            if ff_train > 1.1 * ff_test:
                ofx.append(n_bins)
                ofy.append(smooth)
            opts.append({"ff": np.around(ff, 3), "ii": ii, "smooth": smooth,
                         "ff_test": np.around(ff_test, 3), "ff_train": np.around(ff_train, 3),
                         "n_bins": n_bins,
                         "classifiers_id": self.mk_clf.classifiers_id,
                         "classifiers_fk": self.mk_clf.classifiers_fk})
            if sufficient_norm is not None:
                if ff > sufficient_norm:
                    break
            if (time() - t0) > self.max_run_time:
                break

        opts = sorted(opts, key=itemgetter('ff'), reverse=True)
        opt = opts[0]
        try:
            self.mk_clf.classifiers_id = opt["classifiers_id"]
            self.mk_clf.classifiers_fk = opt["classifiers_fk"]
            self.smooth = opt["smooth"]
            self.n_bins = opt["n_bins"]
            self.set(xx, yy, smooth=opt["smooth"], n_bins=opt["n_bins"])
            self.ff_train, self.ff_test, self.ff_res = opt["ff_train"], opt["ff_test"], opt["ff"]
            if return_opts is True:
                return opts
            else:
                return opt
        except KeyError:
            print("Failed to find a fit for the data")
# do not use 'nopython=True' here since return arrays are created within this function
@jit(forceobj=True)  # equivalent to the deprecated nopython=False
def digitize(data, bins, max_bins=2000):
    """Replacement for np.digitize, sped up with numba."""
    bin_map = np.zeros(max_bins, np.int32)
    ret = np.zeros(data.shape[0], dtype=np.int32)
    bin_nn = len(bin_map)
    bin_l = bins[0]
    bin_d = bins[-1] - bins[0]
    bin_dn = bin_d / bin_nn
    bi = 0
    for ii in range(bin_nn):
        if bin_l + ii * bin_dn > bins[bi]:
            bi += 1
        bin_map[ii] = bi
    for ii in range(data.shape[0]):
        bf = int((data[ii] - bin_l) / bin_dn)
        if bf < 1:
            bf = 1
        if bf > max_bins - 1:
            bf = max_bins - 1
        bi = bin_map[bf]
        if data[ii] > bins[bi]:
            bi += 1
        ret[ii] = bi
    return ret
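
# Example (editor's sketch, not part of the original module): for values inside the bin
# range the result agrees with np.digitize, but the bin lookup is table based:
#
# >>> bins = np.linspace(0.0, 3.0, 4)                      # edges 0, 1, 2, 3
# >>> digitize(np.array([0.5, 1.5, 2.5]), bins)
# -> array([1, 2, 3]), same as np.digitize(np.array([0.5, 1.5, 2.5]), bins)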