1
2
3
4
5
6
7
8
9 """Wrap the libsvm package into a very simple class interface."""
10
11 __docformat__ = 'restructuredtext'
12
13
14 _DEV__doc__ = """
15
16 TODOs:
17 * dual-license under GPL for use of SG?
18 * for recent versions add ability to specify/parametrize normalization
19 scheme for the kernel, and reuse 'scale' now for the normalizer
20 * Add support for simplified linear classifiers (which do not require
21 storing all training SVs/samples to make classification in predict())
22 """
23
24 import numpy as N
25
26 from mvpa import _random_seed
27
28
29 from mvpa.base import externals, warning
30 if externals.exists('shogun', raiseException=True):
31 import shogun.Features
32 import shogun.Classifier
33 import shogun.Regression
34 import shogun.Kernel
35 import shogun.Library
36
37
38 if hasattr(shogun.Kernel, 'M_DEBUG'):
39 _M_DEBUG = shogun.Kernel.M_DEBUG
40 _M_ERROR = shogun.Kernel.M_ERROR
41 elif hasattr(shogun.Kernel, 'MSG_DEBUG'):
42 _M_DEBUG = shogun.Kernel.MSG_DEBUG
43 _M_ERROR = shogun.Kernel.MSG_ERROR
44 else:
45 _M_DEBUG, _M_ERROR = None, None
46 warning("Could not figure out debug IDs within shogun. "
47 "No control over shogun verbosity would be provided")
48
49 try:
50
51 shogun.Library.Math_init_random(_random_seed)
52
53
54 shogun.Library.Math_init_random(_random_seed)
55 except Exception, e:
56 warning('Shogun cannot be seeded due to %s' % (e,))
57
58 import operator
59
60 from mvpa.misc.param import Parameter
61 from mvpa.base import warning
62
63 from mvpa.clfs.base import FailedToTrainError
64 from mvpa.clfs.meta import MulticlassClassifier
65 from mvpa.clfs._svmbase import _SVM
66 from mvpa.misc.state import StateVariable
67 from mvpa.measures.base import Sensitivity
68
69 from sens import *
70
71 if __debug__:
72 from mvpa.base import debug
73
74
def _setdebug(obj, partname):
    """Helper to set level of debugging output for SG

    :Parameters:
      obj
        In SG debug output seems to be set per every object
      partname : basestring
        For what kind of object we are talking about... could be automated
        later on (TODO)
    """
    if _M_DEBUG is None:
        # debug IDs could not be determined at import time -- we have no
        # way to control shogun's verbosity
        return
    debugname = "SG_%s" % partname.upper()

    switch = {True: (_M_DEBUG, 'M_DEBUG', "enable"),
              False: (_M_ERROR, 'M_ERROR', "disable")}

    # verbose only if the corresponding debug target is active
    key = __debug__ and debugname in debug.active

    sglevel, slevel, progressfunc = switch[key]

    if __debug__:
        debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" %
              (partname, repr(obj), slevel))
    obj.io.set_loglevel(sglevel)
    try:
        # resolve "<enable|disable>_progress" via getattr instead of
        # building and exec'ing a code string -- same effect, no dynamic
        # compilation
        getattr(obj.io, "%s_progress" % progressfunc)()
    except:
        warning("Shogun version installed has no way to enable progress" +
                " reports")
104
105
def _tosg(data):
    """Draft helper function to convert data we have into SG suitable format

    Shogun's RealFeatures expects double-precision features x samples,
    hence the astype('double').T below (data is samples x features).

    TODO: Support different datatypes
    """
    if __debug__:
        debug("SG_", "Converting data for shogun into RealFeatures")

    features = shogun.Features.RealFeatures(data.astype('double').T)

    if __debug__:
        debug("SG__", "Done converting data for shogun into RealFeatures")
    _setdebug(features, 'Features')
    return features
121
122
124 """Support Vector Machine Classifier(s) based on Shogun
125
126 This is a simple base interface
127 """
128
129 num_threads = Parameter(1,
130 min=1,
131 doc='Number of threads to utilize')
132
133
134 _KERNELS = {}
135 if externals.exists('shogun', raiseException=True):
136 _KERNELS = { "linear": (shogun.Kernel.LinearKernel,
137 ('scale',), LinearSVMWeights),
138 "rbf" : (shogun.Kernel.GaussianKernel,
139 ('gamma',), None),
140 "rbfshift": (shogun.Kernel.GaussianShiftKernel,
141 ('gamma', 'max_shift', 'shift_step'), None),
142 "sigmoid": (shogun.Kernel.SigmoidKernel,
143 ('cache_size', 'gamma', 'coef0'), None),
144 }
145
146 _KNOWN_PARAMS = [ 'epsilon' ]
147 _KNOWN_KERNEL_PARAMS = [ ]
148
149 _clf_internals = _SVM._clf_internals + [ 'sg', 'retrainable' ]
150
151 if externals.exists('sg ge 0.6.4'):
152 _KERNELS['linear'] = (shogun.Kernel.LinearKernel, (), LinearSVMWeights)
153
154
155
156 """
157 If you'd like to train linear SVMs use SGD or OCAS. These are (I am
158 serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs
159 with standard additive bias, but will L2 regularize it - though it
160 should not matter much in practice (although it will give slightly
161 different solutions)). Note that SGD has no stopping criterion (you
162 simply have to specify the number of iterations) and that OCAS has a
163 different stopping condition than svmlight for example which may be more
164 tight or more loose depending on the problem - I suggest 1e-2 or 1e-3
165 for epsilon.
166
167 If you would like to train kernel SVMs use libsvm/gpdt/svmlight -
168 depending on the problem one is faster than the other (hard to say when,
169 I *think* when your dataset is very unbalanced chunking methods like
170 svmlight/gpdt are better), for smaller problems definitely libsvm.
171
172 If you use string kernels then gpdt/svmlight have a special 'linadd'
173 speedup for this (requires sg 0.6.2 - there was some inefficiency in the
174 code for python-modular before that). This is effective for big datasets
175 and (I trained on 10 million strings based on this).
176
177 And yes currently we only implemented parallel training for svmlight,
178 however all SVMs can be evaluated in parallel.
179 """
180 _KNOWN_IMPLEMENTATIONS = {}
181 if externals.exists('shogun', raiseException=True):
182 _KNOWN_IMPLEMENTATIONS = {
183 "libsvm" : (shogun.Classifier.LibSVM, ('C',),
184 ('multiclass', 'binary'),
185 "LIBSVM's C-SVM (L2 soft-margin SVM)"),
186 "gmnp" : (shogun.Classifier.GMNPSVM, ('C',),
187 ('multiclass', 'binary'),
188 "Generalized Nearest Point Problem SVM"),
189
190 "gpbt" : (shogun.Classifier.GPBTSVM, ('C',), ('binary',),
191 "Gradient Projection Decomposition Technique for " \
192 "large-scale SVM problems"),
193 "gnpp" : (shogun.Classifier.GNPPSVM, ('C',), ('binary',),
194 "Generalized Nearest Point Problem SVM"),
195
196
197
198
199
200
201
202
203
204
205 "libsvr": (shogun.Regression.LibSVR, ('C', 'tube_epsilon',),
206 ('regression',),
207 "LIBSVM's epsilon-SVR"),
208 }
209
210
211 - def __init__(self,
212 kernel_type='linear',
213 **kwargs):
214 """Interface class to Shogun's classifiers and regressions.
215
216 Default implementation is 'libsvm'.
217 """
218
219 svm_impl = kwargs.get('svm_impl', 'libsvm').lower()
220 kwargs['svm_impl'] = svm_impl
221
222
223 _SVM.__init__(self, kernel_type=kernel_type, **kwargs)
224
225 self.__svm = None
226 """Holds the trained svm."""
227
228
229
230
231 self.__traindataset = None
232
233
234 self.__traindata = None
235 self.__kernel = None
236 self.__kernel_test = None
237 self.__testdata = None
238
239
241
242
243
244 if self._svm_impl in ['svrlight', 'lightsvm']:
245 try:
246 kernel.set_precompute_matrix(True, True)
247 except Exception, e:
248
249 if __debug__:
250 debug('SG_', "Failed call to set_precompute_matrix for %s: %s"
251 % (self, e))
252
253
255 """Train SVM
256 """
257
258
259 newkernel, newsvm = False, False
260
261 retrainable = self.params.retrainable
262
263 if retrainable:
264 _changedData = self._changedData
265
266
267 ul = None
268 self.__traindataset = dataset
269
270
271
272
273
274
275 if __debug__:
276 debug("SG_", "Creating labels instance")
277
278 if 'regression' in self._clf_internals:
279 labels_ = N.asarray(dataset.labels, dtype='double')
280 else:
281 ul = dataset.uniquelabels
282 ul.sort()
283
284 if len(ul) == 2:
285
286 _labels_dict = {ul[0]:-1.0, ul[1]:+1.0}
287 elif len(ul) < 2:
288 raise FailedToTrainError, \
289 "We do not have 1-class SVM brought into SG yet"
290 else:
291
292 _labels_dict = dict([ (ul[i], i) for i in range(len(ul))])
293
294
295 _labels_dict_rev = dict([(x[1], x[0])
296 for x in _labels_dict.items()])
297
298
299 self._labels_dict = _labels_dict
300 self._labels_dict_rev = _labels_dict_rev
301
302
303
304
305
306 if __debug__:
307 debug("SG__", "Mapping labels using dict %s" % _labels_dict)
308 labels_ = N.asarray([ _labels_dict[x] for x in dataset.labels ], dtype='double')
309
310 labels = shogun.Features.Labels(labels_)
311 _setdebug(labels, 'Labels')
312
313
314
315 if not retrainable or _changedData['traindata'] or _changedData['kernel_params']:
316
317
318 kargs = []
319 for arg in self._KERNELS[self._kernel_type_literal][1]:
320 value = self.kernel_params[arg].value
321
322 if arg == 'gamma' and value == 0.0:
323 value = self._getDefaultGamma(dataset)
324 kargs += [value]
325
326 if retrainable and __debug__:
327 if _changedData['traindata']:
328 debug("SG",
329 "Re-Creating kernel since training data has changed")
330
331 if _changedData['kernel_params']:
332 debug("SG",
333 "Re-Creating kernel since params %s has changed" %
334 _changedData['kernel_params'])
335
336
337 if __debug__: debug("SG_", "Converting input data for shogun")
338 self.__traindata = _tosg(dataset.samples)
339
340 if __debug__:
341 debug("SG", "Creating kernel instance of %s giving arguments %s" %
342 (`self._kernel_type`, kargs))
343
344 self.__kernel = kernel = \
345 self._kernel_type(self.__traindata, self.__traindata,
346 *kargs)
347
348 if externals.exists('sg ge 0.6.4'):
349 kernel.set_normalizer(shogun.Kernel.IdentityKernelNormalizer())
350
351 newkernel = True
352 self.kernel_params.reset()
353 _setdebug(kernel, 'Kernels')
354
355 self.__condition_kernel(kernel)
356 if retrainable:
357 if __debug__:
358 debug("SG_", "Resetting test kernel for retrainable SVM")
359 self.__kernel_test = None
360 self.__kernel_args = kargs
361
362
363
364 Cs = None
365 if not retrainable or self.__svm is None or _changedData['params']:
366
367 if self.params.isKnown('C'):
368 C = self.params.C
369 if not operator.isSequenceType(C):
370
371 C = [C]
372
373 Cs = list(C[:])
374 for i in xrange(len(Cs)):
375 if Cs[i]<0:
376 Cs[i] = self._getDefaultC(dataset.samples)*abs(Cs[i])
377 if __debug__:
378 debug("SG_", "Default C for %s was computed to be %s" %
379 (C[i], Cs[i]))
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401 svm_impl_class = self.__get_implementation(ul)
402
403 if __debug__:
404 debug("SG", "Creating SVM instance of %s" % `svm_impl_class`)
405
406 if self._svm_impl in ['libsvr', 'svrlight']:
407
408 self.__svm = svm_impl_class(Cs[0], self.params.epsilon, self.__kernel, labels)
409 elif self._svm_impl in ['krr']:
410 self.__svm = svm_impl_class(self.params.tau, self.__kernel, labels)
411 else:
412 self.__svm = svm_impl_class(Cs[0], self.__kernel, labels)
413 self.__svm.set_epsilon(self.params.epsilon)
414
415
416 if self.params.isKnown('shrinking'):
417 shrinking = self.params.shrinking
418 if __debug__:
419 debug("SG_", "Setting shrinking to %s" % shrinking)
420 self.__svm.set_shrinking_enabled(shrinking)
421
422 if Cs is not None and len(Cs) == 2:
423 if __debug__:
424 debug("SG_", "Since multiple Cs are provided: %s, assign them" % Cs)
425 self.__svm.set_C(Cs[0], Cs[1])
426
427 self.params.reset()
428 newsvm = True
429 _setdebug(self.__svm, 'SVM')
430
431 if self.params.isKnown('tube_epsilon') and \
432 hasattr(self.__svm, 'set_tube_epsilon'):
433 self.__svm.set_tube_epsilon(self.params.tube_epsilon)
434 self.__svm.parallel.set_num_threads(self.params.num_threads)
435 else:
436 if __debug__:
437 debug("SG_", "SVM instance is not re-created")
438 if _changedData['labels']:
439 if __debug__: debug("SG__", "Assigning new labels")
440 self.__svm.set_labels(labels)
441 if newkernel:
442 if __debug__: debug("SG__", "Assigning new kernel")
443 self.__svm.set_kernel(self.__kernel)
444 assert(_changedData['params'] is False)
445
446 if retrainable:
447
448 self.states.retrained = not newsvm or not newkernel
449
450
451 if __debug__ and 'SG' in debug.active:
452 if not self.regression:
453 lstr = " with labels %s" % dataset.uniquelabels
454 else:
455 lstr = ""
456 debug("SG", "%sTraining %s on data%s" %
457 (("","Re-")[retrainable and self.states.retrained],
458 self, lstr))
459
460 self.__svm.train()
461
462 if __debug__:
463 debug("SG_", "Done training SG_SVM %s" % self._kernel_type)
464
465
466 if (__debug__ and 'SG__' in debug.active) or \
467 self.states.isEnabled('training_confusion'):
468 try:
469
470 trained_labels = self.__svm.apply().get_labels()
471 except AttributeError:
472
473 trained_labels = self.__svm.classify().get_labels()
474 else:
475 trained_labels = None
476
477 if __debug__ and "SG__" in debug.active:
478 debug("SG__", "Original labels: %s, Trained labels: %s" %
479 (dataset.labels, trained_labels))
480
481
482
483
484
485
486
487
488
489 if self.regression and self.states.isEnabled('training_confusion'):
490 self.states.training_confusion = self._summaryClass(
491 targets=dataset.labels,
492 predictions=trained_labels)
493
495 """Predict values for the data
496 """
497
498 retrainable = self.params.retrainable
499
500 if retrainable:
501 changed_testdata = self._changedData['testdata'] or \
502 self.__kernel_test is None
503
504 if not retrainable or changed_testdata:
505 testdata = _tosg(data)
506
507 if not retrainable:
508 if __debug__:
509 debug("SG__",
510 "Initializing SVMs kernel of %s with training/testing samples"
511 % self)
512
513 self.__kernel.init(self.__traindata, testdata)
514 self.__condition_kernel(self.__kernel)
515 else:
516 if changed_testdata:
517 if __debug__:
518 debug("SG__",
519 "Re-creating testing kernel of %s giving "
520 "arguments %s" %
521 (`self._kernel_type`, self.__kernel_args))
522 kernel_test = self._kernel_type(self.__traindata, testdata,
523 *self.__kernel_args)
524 _setdebug(kernel_test, 'Kernels')
525
526 custk_args = ([self.__traindata, testdata], [])[
527 int(externals.exists('sg ge 0.6.4'))]
528 if __debug__:
529 debug("SG__",
530 "Re-creating custom testing kernel giving "
531 "arguments %s" % (str(custk_args)))
532 kernel_test_custom = shogun.Kernel.CustomKernel(*custk_args)
533
534 _setdebug(kernel_test_custom, 'Kernels')
535 self.__kernel_test = kernel_test_custom
536 self.__kernel_test.set_full_kernel_matrix_from_full(
537 kernel_test.get_kernel_matrix())
538 elif __debug__:
539 debug("SG__", "Re-using testing kernel")
540
541 assert(self.__kernel_test is not None)
542 self.__svm.set_kernel(self.__kernel_test)
543
544 if __debug__:
545 debug("SG_", "Classifying testing data")
546
547
548
549 try:
550
551 values_ = self.__svm.apply()
552 except AttributeError:
553
554 values_ = self.__svm.classify()
555 if values_ is None:
556 raise RuntimeError, "We got empty list of values from %s" % self
557
558 values = values_.get_labels()
559
560 if retrainable:
561
562 self.states.repredicted = repredicted = not changed_testdata
563 if __debug__:
564 debug("SG__", "Re-assigning learing kernel. Repredicted is %s"
565 % repredicted)
566
567 self.__svm.set_kernel(self.__kernel)
568
569 if __debug__:
570 debug("SG__", "Got values %s" % values)
571
572 if ('regression' in self._clf_internals):
573 predictions = values
574 else:
575
576 _labels_dict = self._labels_dict
577 _labels_dict_rev = self._labels_dict_rev
578
579 if len(_labels_dict) == 2:
580 predictions = 1.0 - 2*N.signbit(values)
581 else:
582 predictions = values
583
584
585 label_type = type(_labels_dict.values()[0])
586
587
588 predictions = [_labels_dict_rev[label_type(x)]
589 for x in predictions]
590
591 if __debug__:
592 debug("SG__", "Tuned predictions %s" % predictions)
593
594
595
596
597 self.values = values
598
599
600 if not retrainable:
601 try:
602 testdata.free_features()
603 except:
604 pass
605
606 return predictions
607
608
610 super(SVM, self).untrain()
611 if not self.params.retrainable:
612 if __debug__:
613 debug("SG__", "Untraining %(clf)s and destroying sg's SVM",
614 msgargs={'clf':self})
615
616
617
618 if True:
619 if True:
620
621 if self.__kernel is not None:
622 del self.__kernel
623 self.__kernel = None
624
625 if self.__kernel_test is not None:
626 del self.__kernel_test
627 self.__kernel_test = None
628
629 if self.__svm is not None:
630 del self.__svm
631 self.__svm = None
632
633 if self.__traindata is not None:
634
635
636
637
638 self.__traindata.free_features()
639 del self.__traindata
640 self.__traindata = None
641
642 self.__traindataset = None
643
644
645
646
647
648 if __debug__:
649 debug("SG__",
650 "Done untraining %(self)s and destroying sg's SVM",
651 msgargs=locals())
652 elif __debug__:
653 debug("SG__", "Not untraining %(self)s since it is retrainable",
654 msgargs=locals())
655
656
658 if 'regression' in self._clf_internals or len(ul) == 2:
659 svm_impl_class = SVM._KNOWN_IMPLEMENTATIONS[self._svm_impl][0]
660 else:
661 if self._svm_impl == 'libsvm':
662 svm_impl_class = shogun.Classifier.LibSVMMultiClass
663 elif self._svm_impl == 'gmnp':
664 svm_impl_class = shogun.Classifier.GMNPSVM
665 else:
666 raise RuntimeError, \
667 "Shogun: Implementation %s doesn't handle multiclass " \
668 "data. Got labels %s. Use some other classifier" % \
669 (self._svm_impl, self.__traindataset.uniquelabels)
670 if __debug__:
671 debug("SG_", "Using %s for multiclass data of %s" %
672 (svm_impl_class, self._svm_impl))
673
674 return svm_impl_class
675
676
677 svm = property(fget=lambda self: self.__svm)
678 """Access to the SVM model."""
679
680 traindataset = property(fget=lambda self: self.__traindataset)
681 """Dataset which was used for training
682
683 TODO -- might better become state variable I guess"""
684
685
686
687
688
# Conditionally register additional implementations, depending on what the
# installed shogun build actually provides.  Class/param specs are kept as
# strings so attribute lookup is deferred until existence is confirmed.
for name, item, params, descr in \
        [('mpd', "shogun.Classifier.MPDSVM", "('C',), ('binary',)",
          "MPD classifier from shogun"),
         ('lightsvm', "shogun.Classifier.SVMLight", "('C',), ('binary',)",
          "SVMLight classification http://svmlight.joachims.org/"),
         ('svrlight', "shogun.Regression.SVRLight", "('C','tube_epsilon',), ('regression',)",
          "SVMLight regression http://svmlight.joachims.org/"),
         ('krr', "shogun.Regression.KRR", "('tau',), ('regression',)",
          "Kernel Ridge Regression"),
         ]:
    if externals.exists('shogun.%s' % name):
        # eval() of the constant strings defined just above (no untrusted
        # input) -- replaces the fragile %-interpolated exec of a code
        # string while producing an identical 4-tuple entry
        SVM._KNOWN_IMPLEMENTATIONS[name] = (eval(item),) + eval(params) + (descr,)
701
702
703 LinearSVMWeights._LEGAL_CLFS = [SVM]
704