gatapia/py_ml_utils

Could you check the results I get from introspect_sklearn.py?

Opened this issue · 0 comments

After a slight change to the script, the results are shown below.
Is it working as expected?


          GradientBoostingRegressor    0.8295(+-0.0387)      0.07m
                ExtraTreesRegressor    0.8220(+-0.0485)      0.07m
                   BaggingRegressor    0.7951(+-0.0435)      0.08m
              RandomForestRegressor    0.7938(+-0.0446)      0.08m
                  AdaBoostRegressor    0.7448(+-0.0311)      0.07m
                 ExtraTreeRegressor    0.7230(+-0.0636)      0.06m
                   LinearRegression    0.6812(+-0.0225)      0.06m
                            RidgeCV    0.6806(+-0.0228)      0.07m
                        LassoLarsCV    0.6795(+-0.0218)      0.06m
                               Lars    0.6765(+-0.0181)      0.07m
                              Ridge    0.6759(+-0.0235)      0.07m
                             LarsCV    0.6701(+-0.0199)      0.06m
                        LassoLarsIC    0.6678(+-0.0297)      0.06m
                      BayesianRidge    0.6652(+-0.0215)      0.07m
                      ARDRegression    0.6623(+-0.0239)      0.10m
              DecisionTreeRegressor    0.6592(+-0.0570)      0.06m
                            LassoCV    0.6310(+-0.0220)      0.06m
                  TheilSenRegressor    0.6257(+-0.0236)      0.09m
        OrthogonalMatchingPursuitCV    0.6247(+-0.0297)      0.06m
                         ElasticNet    0.6225(+-0.0200)      0.06m
                              Lasso    0.6146(+-0.0208)      0.06m
                          LinearSVR    0.6071(+-0.0385)      0.07m
                       ElasticNetCV    0.6036(+-0.0230)      0.06m
                     HuberRegressor    0.5641(+-0.0184)      0.07m
                    GaussianProcess    0.5381(+-0.0321)      0.07m
          OrthogonalMatchingPursuit    0.5126(+-0.0178)      0.06m
                KNeighborsRegressor    0.4587(+-0.0687)      0.07m
                    RANSACRegressor    0.4418(+-0.0801)      0.07m
                          LassoLars   -0.0140(+-0.0056)      0.06m
                                SVR   -0.0204(+-0.0183)      0.06m
                              NuSVR   -0.0331(+-0.0231)      0.06m
                       MLPRegressor   -0.4991(+-0.1958)      0.07m
         PassiveAggressiveRegressor   -1.5901(+-0.8248)      0.06m
           GaussianProcessRegressor   -5.5622(+-0.2112)      0.07m
                    GaussianMixture  -36.3100(+-0.1037)      0.08m
            BayesianGaussianMixture  -36.4831(+-0.1084)      0.06m
                       SGDRegressor-740255963447501177432834048.0000(+-721608378456158132773060608.0000)      0.07m
[{'name': 'GradientBoostingRegressor', 'score': 0.8294675950620558, 'sem': 0.038697277160232024, 'took': 0.06543353001276651}, {'name': 'ExtraTreesRegressor', 'score': 0.8219932768079022, 'sem': 0.04854419090176812, 'took': 0.07448914845784506}, {'name': 'BaggingRegressor', 'score': 0.7951410144472711, 'sem': 0.04345846631050684, 'took': 0.07726001342137655}, {'name': 'RandomForestRegressor', 'score': 0.7938016078449767, 'sem': 0.04461968326284518, 'took': 0.07662835915883383}, {'name': 'AdaBoostRegressor', 'score': 0.7447621032864981, 'sem': 0.031141049023248508, 'took': 0.07199073632558187}, {'name': 'ExtraTreeRegressor', 'score': 0.7230142892238222, 'sem': 0.06363436756236208, 'took': 0.05977343320846558}, {'name': 'LinearRegression', 'score': 0.6811986578778075, 'sem': 0.022541292446391012, 'took': 0.06334726413091024}, {'name': 'RidgeCV', 'score': 0.68062737759137, 'sem': 0.02279257522997783, 'took': 0.0659236749013265}, {'name': 'LassoLarsCV', 'score': 0.6795132272324104, 'sem': 0.021822428857612756, 'took': 0.06067108710606893}, {'name': 'Lars', 'score': 0.6765114810873696, 'sem': 0.018053891175682096, 'took': 0.06525879303614299}, {'name': 'Ridge', 'score': 0.6759086828183462, 'sem': 0.023513062050043766, 'took': 0.07132590214411418}, {'name': 'LarsCV', 'score': 0.6700565280793053, 'sem': 0.019870880775850946, 'took': 0.06492633024851481}, {'name': 'LassoLarsIC', 'score': 0.6678156115866583, 'sem': 0.02974701929494405, 'took': 0.06329733530680338}, {'name': 'BayesianRidge', 'score': 0.6651651366473242, 'sem': 0.021496925505518193, 'took': 0.06761914094289144}, {'name': 'ARDRegression', 'score': 0.6623191491579764, 'sem': 0.023881799569544726, 'took': 0.09591010411580404}, {'name': 'DecisionTreeRegressor', 'score': 0.6591940650045099, 'sem': 0.057038232161180744, 'took': 0.06289846499760945}, {'name': 'LassoCV', 'score': 0.6310094141412307, 'sem': 0.02202262657221789, 'took': 0.06336384216944377}, {'name': 'TheilSenRegressor', 'score': 0.6257405152620453, 
'sem': 0.02361953451321954, 'took': 0.09195405642191569}, {'name': 'OrthogonalMatchingPursuitCV', 'score': 0.6246585582406875, 'sem': 0.029688398100226585, 'took': 0.060205602645874025}, {'name': 'ElasticNet', 'score': 0.6224674699789806, 'sem': 0.019954922816295302, 'took': 0.0626823623975118}, {'name': 'Lasso', 'score': 0.6146425212051473, 'sem': 0.02082079523461588, 'took': 0.06097027063369751}, {'name': 'LinearSVR', 'score': 0.6071304632076193, 'sem': 0.03851599977382228, 'took': 0.07217363119125367}, {'name': 'ElasticNetCV', 'score': 0.6035630964040316, 'sem': 0.023006730065823905, 'took': 0.06472687323888143}, {'name': 'HuberRegressor', 'score': 0.5641180697854233, 'sem': 0.018390157383013184, 'took': 0.06544162034988403}, {'name': 'GaussianProcess', 'score': 0.5381169234593121, 'sem': 0.03210310436991657, 'took': 0.07119294007619222}, {'name': 'OrthogonalMatchingPursuit', 'score': 0.5126458830625192, 'sem': 0.017778179873312067, 'took': 0.0615354061126709}, {'name': 'KNeighborsRegressor', 'score': 0.4587164178672096, 'sem': 0.06868620796189272, 'took': 0.06512523492177327}, {'name': 'RANSACRegressor', 'score': 0.441831451671089, 'sem': 0.08014354503424943, 'took': 0.07194092671076456}, {'name': 'LassoLars', 'score': -0.014021478460640205, 'sem': 0.0056452484821212226, 'took': 0.05821096499760946}, {'name': 'SVR', 'score': -0.020417639894389383, 'sem': 0.018253507675483005, 'took': 0.05942434867223104}, {'name': 'NuSVR', 'score': -0.03306591529425958, 'sem': 0.023059489071394088, 'took': 0.0626325249671936}, {'name': 'MLPRegressor', 'score': -0.49910976628295933, 'sem': 0.195822539180817, 'took': 0.06647159655888875}, {'name': 'PassiveAggressiveRegressor', 'score': -1.590134321109338, 'sem': 0.8248219779802669, 'took': 0.059042084217071536}, {'name': 'GaussianProcessRegressor', 'score': -5.562202602883623, 'sem': 0.21117314890579691, 'took': 0.07112640937169393}, {'name': 'GaussianMixture', 'score': -36.30995722491594, 'sem': 0.10368584439449659, 'took': 
0.07692754666010539}, {'name': 'BayesianGaussianMixture', 'score': -36.4831297985314, 'sem': 0.10839611693811571, 'took': 0.06073753039042155}, {'name': 'SGDRegressor', 'score': -7.402559634475012e+26, 'sem': 7.216083784561581e+26, 'took': 0.06557461420694986}]

The changed code is:

from __future__ import print_function, absolute_import

import inspect, warnings, sklearn, psutil, numpy, re, time
import numpy as np
from misc import * # July29
from OverridePredictFunctionClassifier import * #July29

from sklearn import cluster, covariance, \
  decomposition, ensemble, feature_extraction, feature_selection, \
  gaussian_process, isotonic, kernel_approximation,  learning_curve, \
  linear_model, manifold, mixture, multiclass, naive_bayes, \
  neighbors, neural_network, cross_decomposition, preprocessing, \
   random_projection, semi_supervised, svm, tree, datasets # July29 remmoved lda, qda,

def get_python_processes():
  """Return how many running processes have 'python' in their name.

  Processes whose name cannot be read (for example because they exited or
  access was denied while iterating) are not counted.
  """
  def is_python_process(p):
    try:
      name = p.name
      # Newer psutil releases expose name() as a method, older ones as a
      # plain attribute; support both.
      if callable(name): name = name()
      return 'python' in name
    except Exception:
      return False
  # The predicate has to be *called*; filtering on the function object itself
  # is always truthy and would count every process on the machine.
  return sum(1 for p in psutil.process_iter() if is_python_process(p))

def get_classifiers(module=None, done=None):
  """Recursively collect sklearn classes that fit and predict with defaults.

  Walks the members of `module` (the top-level sklearn package when omitted),
  descends into sub-modules whose name starts with 'sklearn', and keeps every
  class that can be built without arguments and then run through
  fit(X, y).predict(X) on a small synthetic regression dataset.

  Parameters
  ----------
  module : module, optional
    Module to inspect; defaults to sklearn.
  done : list of str, optional
    Names of modules and classes already visited. It is shared through the
    recursion and mutated in place; leave it unset for a fresh walk.

  Returns
  -------
  list
    The classes that survived the fit/predict smoke test.
  """
  if module is None: module = sklearn
  # Create the visited list per top-level call. A mutable default would be
  # shared between calls, so a second call would find everything "done" and
  # return an empty list.
  if done is None: done = []
  ignores = ['MemmapingPool', 'PicklingPool', 'externals',
    'datasets', 'EllipticEnvelope', 'OneClassSVM']
  classifiers = []
  X, y = sklearn.datasets.make_regression(n_samples=20, n_features=5)
  for name, cls in inspect.getmembers(module):
    if name in ignores: continue

    if inspect.ismodule(cls):
      module_name = cls.__name__
      if module_name.startswith('_') or \
          module_name.endswith('_') or \
          not module_name.startswith('sklearn') or \
          module_name in done or \
          any(part in ignores for part in module_name.split('.')): continue
      done.append(module_name)
      classifiers += get_classifiers(cls, done)

    if inspect.isclass(cls):
      # Skip helper-looking names: anything containing an underscore or
      # starting with a lower-case letter.
      if '_' in name or name[0].islower(): continue
      if cls.__module__.startswith('_') or \
          cls.__module__.endswith('_') or \
          not cls.__module__.startswith('sklearn'): continue

      full_name = cls.__module__ + '.' + cls.__name__
      if full_name in done: continue
      done.append(full_name)
      # NOTE: the per-class python process counting that used to surround this
      # smoke test only fed a check that had been commented out, so it is gone.
      try: cls().fit(X, y).predict(X)
      except Exception: continue  # cannot be used with default arguments
      classifiers.append(cls)
  return classifiers

# Score records produced by the latest try_all_classifiers call; that function
# rebinds this name to a new list each time it runs.
all_scores = []
# Classes returned by get_classifiers, stored by try_all_classifiers so that the
# discovery walk is only performed once.
cached_classifiers = None

def _format_number(value, width, precision=4, big=1e6):
  """Format value fixed-point, switching to scientific notation when huge."""
  spec = 'e' if abs(value) >= big else 'f'
  return '{:{w}.{p}{s}}'.format(value, w=width, p=precision, s=spec)

def _format_result_row(entry):
  """Render one score record as an aligned line of the summary table."""
  score = _format_number(entry['score'], 10)
  # Keep at least one blank between name and score even when the number is
  # wider than its column, so the two never run together.
  if not score.startswith(' '): score = ' ' + score
  return '{:>35}{}(+-{}){:10.2f}m'.format(
    entry['name'], score, _format_number(entry['sem'], 5), entry['took'])

def try_all_classifiers(X, y, classifiers=None, scoring=None,
    ignore=None, classification=None, use_proba=False, classifier_transform=None):
  """Cross-validate every candidate estimator class and print a ranking.

  Each class is instantiated with default arguments and scored through do_cv
  (not defined in this block; presumably supplied by the star import from
  misc). Estimators that raise are reported and left out of the results.

  Parameters
  ----------
  X, y : training data and target; X is copied before each evaluation.
  classifiers : list of classes, optional
    Candidates to evaluate. When None, get_classifiers(sklearn) is used and
    its result is cached in the module-level cached_classifiers.
  scoring : forwarded to do_cv.
  ignore : collection of str, optional
    Class names to skip.
  classification : bool or None
    True keeps only ClassifierMixin estimators, False keeps only
    RegressorMixin estimators, None applies no filter.
  use_proba : bool
    When set, estimators without predict_proba are wrapped in
    OverridePredictFunctionClassifier using decision_function or predict.
  classifier_transform : callable, optional
    Applied to each estimator instance before it is scored.

  Returns
  -------
  list of dict
    Records with keys 'name', 'score', 'sem' and 'took' (minutes), sorted by
    descending score. The same list is stored in the global all_scores.
  """
  global all_scores, cached_classifiers
  all_scores = []
  if ignore is None: ignore = ()
  if classifiers is None:
    print('calling get_classifiers')
    if cached_classifiers is None:
      classifiers = get_classifiers(sklearn)
      cached_classifiers = classifiers
    else:
      classifiers = cached_classifiers
    print('got ' + repr(len(classifiers)) + ' classifiers')

  for classifier in classifiers:
    if classifier.__name__ in ignore: continue
    try:
      print('testing classifier:', classifier.__name__)
      t0 = time.time()
      clf = classifier()
      # Equality (not identity) is kept on purpose so that None means
      # "no filtering" while truthy/falsy flags such as 1/0 still work.
      if classification == True and not isinstance(clf, sklearn.base.ClassifierMixin):
        print('is classification and classifier is not a ClassifierMixin')
        continue
      if classification == False and not isinstance(clf, sklearn.base.RegressorMixin):
        print('is NOT classification and classifier is not a RegressorMixin')
        continue
      if hasattr(clf, 'n_estimators'): clf.n_estimators = 200
      if use_proba and not hasattr(clf, 'predict_proba'):
        func = 'decision_function' if hasattr(clf, 'decision_function') else 'predict'
        clf = OverridePredictFunctionClassifier(clf, func)
      if classifier_transform is not None: clf = classifier_transform(clf)

      score, sem = do_cv(clf, X.copy(), y, len(y), n_iter=3, scoring=scoring, quiet=True)
      took = (time.time() - t0) / 60.
      all_scores.append({'name':classifier.__name__, 'score': score, 'sem': sem, 'took': took})
      print('classifier:', classifier.__name__, 'score:', score, 'sem:', sem, 'took: %.1fm' % took)
    except Exception as e:
      print('classifier:', classifier.__name__, 'error - not included in results - ' + str(e))

  all_scores = sorted(all_scores, key=lambda t: t['score'], reverse=True)
  print('\t\tsuccessfull classifiers\n', '\n')
  for entry in all_scores:
    print(_format_result_row(entry))
  print(all_scores)
  return all_scores

def parse_classifier_meta(classifier):
  """Extract the documented parameters from a class docstring.

  The docstring is scanned from the 'Parameters' heading up to 'See Also' (or
  the end). A line containing ':' starts a new parameter ('name : type');
  any other line is appended to the description of the current parameter.

  Parameters
  ----------
  classifier : class
    Class whose __doc__ is parsed; a missing docstring yields no arguments.

  Returns
  -------
  dict
    {'classifier': classifier, 'args': [...]} where each arg is a dict with
    the keys 'name', 'type' and 'description'.
  """
  doc = classifier.__doc__ or ''
  # NOTE: every run of hyphens becomes a line break. That removes the heading
  # underlines but also splits hyphenated words such as 'array-like'.
  stripped = (s.strip() for s in re.sub('-+', '\n', doc).split('\n'))
  lines = [s for s in stripped if s]
  args = []
  started = False
  curr_arg = None
  for l in lines:
    if not started:
      if l == 'Parameters': started = True
      continue
    if l == 'See Also': break
    if ':' in l:
      # A real list is needed here: map() objects cannot be indexed on Python 3.
      name_type = [s.strip() for s in l.split(':')]
      curr_arg = { 'name': name_type[0], 'type': name_type[1], 'description': '' }
      args.append(curr_arg)
    elif curr_arg is None:
      # Text before the first parameter has nothing to attach to; report it
      # and carry on instead of failing on a None lookup.
      print('invalid line [%s] doc: %s' % (l, doc))
    else:
      curr_arg['description'] += l
  return {'classifier': classifier, 'args': args }

def parse_float_type(t):
  """Return 100 evenly spaced candidate values for a float parameter.

  When the type text contains a range written as '<low> < name < <high>' the
  values span that range; otherwise, or when the captured bounds are not
  valid numbers, they span -100 to 100.
  """
  bounds = re.search(r'.* ([0-9.]+) < .* < ([0-9.]+)', t)
  if bounds:
    try:
      low, high = float(bounds.group(1)), float(bounds.group(2))
    except ValueError:
      # The character class also accepts text such as '.' or '1.2.3'.
      pass
    else:
      return np.linspace(low, high, 100)
  return np.linspace(-100, 100, 100)

def parse_range_type(t):
  """Return the distinct options listed in a '{a, b, ...}' type description.

  Tokens are runs of letters, digits, underscores and dots. The tokens
  'None', 'True' and 'False' are converted to the matching Python constants;
  everything else stays a string. Duplicates are dropped and the order of
  first appearance is kept, so the result is deterministic.
  """
  constants = {'None': None, 'True': True, 'False': False}
  options = []
  # An explicit class is used because the range 'A-z' would also match the
  # punctuation that sits between 'Z' and 'a' ('[', '\\', ']', '^', '`').
  for token in re.findall(r'[A-Za-z0-9_.]+', t):
    value = constants.get(token, token)
    if value not in options: options.append(value)
  return options

def parse_string_type(t, d):
  """Return the distinct quoted words found in a type text and description.

  Parameters
  ----------
  t : str
    Type text of the parameter.
  d : str
    Description of the parameter; line breaks are treated as spaces.

  Returns
  -------
  list of str
    Words (letters, digits, underscores) that appear between single or
    double quotes, without duplicates, in order of first appearance.
  """
  text = t + ' ' + d.replace('\n', ' ')
  words = []
  # Explicit class instead of 'A-z', which would also match '[', ']', '^', etc.
  for word in re.findall(r'[\'"]([A-Za-z0-9_]+)[\'"]', text):
    if word not in words: words.append(word)
  return words

def get_val_for_type(name, t, desc):
  """Pick candidate values for one documented parameter, or None to skip it.

  Parameters
  ----------
  name : str
    Parameter name.
  t : str
    Type text taken from the docstring.
  desc : str
    Parameter description; only consulted for string-typed parameters.

  Returns
  -------
  A collection of values to try, or None when the parameter is ignored
  (listed name, empty or array-like type, class/ref/meth/callable type,
  name starting or ending with an underscore) or the type is unrecognised.
  """
  skipped_names = ('base_estimator', 'class_weight')
  if name in skipped_names or not t or 'array' in t:
    return None
  if t.startswith(('class', 'ref', 'meth', 'callable')):
    return None
  if name.startswith('_') or name.endswith('_'):
    return None

  # The first matching type prefix decides which candidates are produced.
  if t.startswith(('bool', 'Bool')):
    candidates = [True, False]
  elif t.startswith('float'):
    candidates = parse_float_type(t)
  elif t.startswith('int'):
    candidates = range(0, 2000, 10)
  elif t.startswith('str'):
    candidates = parse_string_type(t, desc)
  elif t.startswith('{'):
    candidates = parse_range_type(t)
  elif t.startswith('double'):
    candidates = np.linspace(-100, 100, 100)
  else:
    candidates = None
  return candidates

def try_classifier_with_arg_customisation(meta, run_grid_search=False):
  """Build a parameter grid for a classifier from its parsed docstring.

  Parameters
  ----------
  meta : dict
    A record with the keys 'classifier' and 'args', where each arg has
    'name', 'type' and 'description' (the shape parse_classifier_meta returns).
  run_grid_search : bool, optional
    When True and the grid is not empty, run do_gs on the Boston housing
    data. Off by default, matching the previously hard-disabled branch.
    NOTE(review): datasets.load_boston is not available in recent
    scikit-learn releases - confirm before enabling.

  Returns
  -------
  dict
    Maps each usable parameter name to its candidate values.
  """
  clf = meta['classifier']
  gs_args = {}
  for a in meta['args']:
    vals = get_val_for_type(a['name'], a['type'], a['description'])
    # Identity test: comparing a NumPy array to None with != is elementwise
    # and cannot be used as an if-condition.
    if vals is not None: gs_args[a['name']] = vals
  if run_grid_search and gs_args:
    boston_data = datasets.load_boston()
    X = boston_data['data']
    y = boston_data['target']
    do_gs(clf(), X, y, gs_args)
  return gs_args




if __name__ == '__main__':
  # Script entry point: rank every discovered estimator on the Boston data.
  boston_data = datasets.load_boston()
  X, y = boston_data['data'], boston_data['target']
  try_all_classifiers(X, y)
  # Disabled follow-up experiment, kept for reference:
  # metas = [parse_classifier_meta(clf) for clf in classifiers]
  # ignore = [test_classifier_with_arg_customisation(m) for m in metas]