Documentation and comments for filter.clj. Not finished yet.
This commit is contained in:
parent
1e8d1d24ec
commit
ecd2e3579f
3 changed files with 192 additions and 63 deletions
|
@ -11,7 +11,55 @@
|
|||
versions so they can be built without having all the dataset instances in memory.
|
||||
|
||||
Functions for evaluating the classifiers built using cross validation or a training
|
||||
set are also provided"
|
||||
set are also provided.
|
||||
|
||||
A sample use of the API for classifiers is shown below:
|
||||
|
||||
(use 'clj-ml.classifiers)
|
||||
|
||||
; Building a classifier using a C4.5 decision tree
|
||||
(def *classifier* (make-classifier :decission-tree :c45))
|
||||
|
||||
; We set the class attribute for the loaded dataset.
|
||||
; *dataset* is supposed to contain a set of instances.
|
||||
(dataset-set-class *dataset* 4)
|
||||
|
||||
; Training the classifier
|
||||
(classifier-train *classifier* *dataset*)
|
||||
|
||||
; We evaluate the classifier using a test dataset
|
||||
(def *evaluation* (classifier-evaluate *classifier* :dataset *dataset* *trainingset*))
|
||||
|
||||
; We retrieve some data from the evaluation result
|
||||
(:kappa *evaluation*)
|
||||
(:root-mean-squared-error *evaluation*)
|
||||
(:precision *evaluation*)
|
||||
|
||||
; A trained classifier can be used to classify new instances
|
||||
(def *to-classify* (make-instance ds {:class :Iris-versicolor
|
||||
:petalwidth 0.2
|
||||
:petallength 1.4
|
||||
:sepalwidth 3.5
|
||||
:sepallength 5.1}))
|
||||
|
||||
; We retrieve the index of the class value assigned by the classifier
|
||||
(classifier-classify *classifier* *to-classify*)
|
||||
|
||||
; We retrieve a symbol with the value assigned by the classifier
|
||||
(classifier-label *classifier* *to-classify*)
|
||||
|
||||
A classifier can also be trained using cross-validation:
|
||||
|
||||
(classifier-evaluate *classifier* :cross-validation ds 10)
|
||||
|
||||
Finally a classifier can be stored in a file for later use:
|
||||
|
||||
(use 'clj-ml.utils)
|
||||
|
||||
(serialize-to-file *classifier*
|
||||
\"/Users/antonio.garrote/Desktop/classifier.bin\")
|
||||
|
||||
"
|
||||
(:use [clj-ml utils data kernel-functions])
|
||||
(:import (java.util Date Random)
|
||||
(weka.classifiers.trees J48)
|
||||
|
@ -42,7 +90,7 @@
|
|||
cols-val-a (check-option-values {:pruning-confidence "-C"
|
||||
:minimum-instances "-M"
|
||||
:pruning-number-folds "-N"
|
||||
:shuffling-random-seed "-Q"}
|
||||
:random-seed "-Q"}
|
||||
map
|
||||
cols-val)]
|
||||
(into-array cols-val-a))))
|
||||
|
@ -69,7 +117,7 @@
|
|||
:momentum "-M"
|
||||
:epochs "-N"
|
||||
:percentage-validation-set "-V"
|
||||
:seed "-S"
|
||||
:random-seed "-S"
|
||||
:threshold-number-errors "-E"}
|
||||
map
|
||||
cols-val)]
|
||||
|
@ -121,26 +169,84 @@
|
|||
This is the description of the supported classifiers and the accepted
|
||||
option parameters for each of them:
|
||||
|
||||
* :decission-tree :c45
|
||||
* :decission-tree :c45
|
||||
|
||||
A classifier building a pruned or unpruned C 4.5 decission tree using
|
||||
Weka J 4.8 implementation.
|
||||
A classifier building a pruned or unpruned C 4.5 decission tree using
|
||||
Weka J 4.8 implementation.
|
||||
|
||||
Parameters:
|
||||
Parameters:
|
||||
|
||||
- :unpruned Use unpruned tree. Sample value: true
|
||||
- :reduce-error-pruning Sample value: true
|
||||
- :only-binary-splits Sample value: true
|
||||
- :no-raising Sample value: true
|
||||
- :no-cleanup Sample value: true
|
||||
- :laplace-smoothing For predicted probabilities. Sample value: true
|
||||
- :pruning-confidence Threshold for pruning. Default value: 0.25
|
||||
- :minimum-instances Minimum number of instances per leave. Default
|
||||
value: 2
|
||||
- :pruning-number-folds Set number of folds for reduced error pruning.
|
||||
Default value: 3
|
||||
- :shuffling-random-seed Seed for random data shuffling. Default value: 1
|
||||
"
|
||||
- :unpruned
|
||||
Use unpruned tree. Sample value: true
|
||||
- :reduce-error-pruning
|
||||
Sample value: true
|
||||
- :only-binary-splits
|
||||
Sample value: true
|
||||
- :no-raising
|
||||
Sample value: true
|
||||
- :no-cleanup
|
||||
Sample value: true
|
||||
- :laplace-smoothing
|
||||
For predicted probabilities. Sample value: true
|
||||
- :pruning-confidence
|
||||
Threshold for pruning. Default value: 0.25
|
||||
- :minimum-instances
|
||||
Minimum number of instances per leaf. Default value: 2
|
||||
- :pruning-number-folds
|
||||
Set number of folds for reduced error pruning. Default value: 3
|
||||
- :random-seed
|
||||
Seed for random data shuffling. Default value: 1
|
||||
|
||||
* :bayes :naive
|
||||
|
||||
Classifier based on the Bayes' theorem with strong independence assumptions, among the
|
||||
probabilistic variables.
|
||||
|
||||
Parameters:
|
||||
|
||||
- :kernel-estimator
|
||||
Use kernel density estimator rather than normal. Sample value: true
|
||||
- :supervised-discretization
|
||||
Use supervised discretization to process numeric attributes (see :supervised-discretize
|
||||
filter in clj-ml.filters/make-filter function). Sample value: true
|
||||
|
||||
* :neural-network :multilayer-perceptron
|
||||
|
||||
Classifier built using a feedforward artificial neural network with three or more layers
|
||||
of neurons and nonlinear activation functions. It is able to distinguish data that is not
|
||||
linearly separable.
|
||||
|
||||
Parameters:
|
||||
|
||||
- :no-nominal-to-binary
|
||||
A :nominal-to-binary filter will not be applied by default. (see :supervised-nominal-to-binary
|
||||
filter in clj-ml.filters/make-filter function). Default value: false
|
||||
- :no-numeric-normalization
|
||||
A numeric class will not be normalized. Default value: false
|
||||
- :no-nomalization
|
||||
No attribute will be normalized. Default value: false
|
||||
- :no-reset
|
||||
Resetting the network will not be allowed. Default value: false
|
||||
- :learning-rate-decay
|
||||
Learning rate decay will occur. Default value: false
|
||||
- :learning-rate
|
||||
Learning rate for the backpropagation algorithm. Value should be between [0,1].
|
||||
Default value: 0.3
|
||||
- :momentum
|
||||
Momentum rate for the backpropagation algorithm. Value should be between [0,1].
|
||||
Default value: 0.2
|
||||
- :epochs
|
||||
Number of iterations to train through. Default value: 500
|
||||
- :percentage-validation-set
|
||||
Percentage size of validation set to use to terminate training. If it is not zero
|
||||
it takes precedence over the number of epochs to finish training. Values should be
|
||||
between [0,100]. Default value: 0
|
||||
- :random-seed
|
||||
Value of the seed for the random generator. Values should be longs greater than
|
||||
0. Default value: 0
|
||||
- :threshold-number-errors
|
||||
The consecutive number of errors allowed for validation testing before the network
|
||||
terminates. Values should be greater than 0. Default value: 20"
|
||||
(fn [kind algorithm & options] [kind algorithm]))
|
||||
|
||||
(defmethod make-classifier [:decission-tree :c45]
|
||||
|
|
|
@ -155,9 +155,12 @@
|
|||
|
||||
Parameters:
|
||||
|
||||
- :attributes Index of the attributes to be discretized, sample value: [0,4,6]
|
||||
- :invert Invert mathcing sense of the columns, sample value: true
|
||||
- :kononenko Use Kononenko's MDL criterion, sample value: true
|
||||
- :attributes
|
||||
Index of the attributes to be discretized, sample value: [0,4,6]
|
||||
- :invert
|
||||
Invert matching sense of the columns, sample value: true
|
||||
- :kononenko
|
||||
Use Kononenko's MDL criterion, sample value: true
|
||||
|
||||
* :unsupervised-discretize
|
||||
|
||||
|
@ -166,19 +169,25 @@
|
|||
|
||||
Parameters:
|
||||
|
||||
- :attributes Index of the attributes to be discretized, sample value: [0,4,6]
|
||||
- :dataset-format The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :unset-class Does not take class attribute into account for the application
|
||||
of the filter, sample-value: true
|
||||
- :attributes
|
||||
Index of the attributes to be discretized, sample value: [0,4,6]
|
||||
- :dataset-format
|
||||
The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :unset-class
|
||||
Does not take class attribute into account for the application
|
||||
of the filter, sample-value: true
|
||||
- :binary
|
||||
- :equal-frequency Use equal frequency instead of equal width discretization, sample
|
||||
value: true
|
||||
- :optimize Optmize the number of bins using leave-one-out estimate of
|
||||
estimated entropy. Ingores the :binary attribute. sample value: true
|
||||
- :number-bins Defines the number of bins to divide the numeric attributes into
|
||||
sample value: 3
|
||||
- :equal-frequency
|
||||
Use equal frequency instead of equal width discretization, sample
|
||||
value: true
|
||||
- :optimize
|
||||
Optimize the number of bins using leave-one-out estimate of
|
||||
estimated entropy. Ignores the :binary attribute. sample value: true
|
||||
- :number-bins
|
||||
Defines the number of bins to divide the numeric attributes into
|
||||
sample value: 3
|
||||
|
||||
* :supervised-nominal-to-binary
|
||||
|
||||
|
@ -186,12 +195,15 @@
|
|||
is transformed into k binary attributes if the class is nominal.
|
||||
|
||||
Parameters:
|
||||
- :dataset-format The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :also-binary Sets if binary attributes are to be coded as nominal ones, sample value: true
|
||||
- :for-each-nominal For each nominal value one binary attribute is created, not only if the
|
||||
values of the nominal attribute are greater than two.
|
||||
- :dataset-format
|
||||
The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :also-binary
|
||||
Sets if binary attributes are to be coded as nominal ones, sample value: true
|
||||
- :for-each-nominal
|
||||
For each nominal value one binary attribute is created, not only if the
|
||||
values of the nominal attribute are greater than two.
|
||||
|
||||
* :unsupervised-nominal-to-binary
|
||||
|
||||
|
@ -199,13 +211,17 @@
|
|||
|
||||
Parameters:
|
||||
|
||||
- :attributes Index of the attributes to be binarized. Sample value: [1 2 3]
|
||||
- :dataset-format The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :also-binary Sets if binary attributes are to be coded as nominal ones, sample value: true
|
||||
- :for-each-nominal For each nominal value one binary attribute is created, not only if the
|
||||
values of the nominal attribute are greater than two., sample value: true
|
||||
- :attributes
|
||||
Index of the attributes to be binarized. Sample value: [1 2 3]
|
||||
- :dataset-format
|
||||
The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :also-binary
|
||||
Sets if binary attributes are to be coded as nominal ones, sample value: true
|
||||
- :for-each-nominal
|
||||
For each nominal value one binary attribute is created, not only if the
|
||||
values of the nominal attribute are greater than two. Sample value: true
|
||||
|
||||
* :remove-attributes
|
||||
|
||||
|
@ -213,10 +229,12 @@
|
|||
|
||||
Parameters:
|
||||
|
||||
- :dataset-format The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :attributes: Index of the attributes to remove. Sample value: [1 2 3]
|
||||
- :dataset-format
|
||||
The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :attributes
|
||||
Index of the attributes to remove. Sample value: [1 2 3]
|
||||
|
||||
* :select-append-attributes
|
||||
|
||||
|
@ -224,11 +242,14 @@
|
|||
|
||||
Parameters:
|
||||
|
||||
- :dataset-format The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :attributes Index of the attributes to remove. Sample value: [1 2 3]
|
||||
- :invert Invert the selection of the columns. Sample value: [0 1]
|
||||
- :dataset-format
|
||||
The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :attributes
|
||||
Index of the attributes to remove. Sample value: [1 2 3]
|
||||
- :invert
|
||||
Invert the selection of the columns. Sample value: [0 1]
|
||||
|
||||
* :project-attributes
|
||||
|
||||
|
@ -236,10 +257,12 @@
|
|||
|
||||
Parameters:
|
||||
|
||||
- :dataset-format The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :invert Invert the selection of columns. Sample value: [0 1]"
|
||||
- :dataset-format
|
||||
The dataset where the filter is going to be applied or a
|
||||
description of the format of its attributes. Sample value:
|
||||
dataset, (dataset-format dataset)
|
||||
- :invert
|
||||
Invert the selection of columns. Sample value: [0 1]"
|
||||
(fn [kind options] kind))
|
||||
|
||||
(defmethod make-filter :supervised-discretize
|
||||
|
@ -285,7 +308,7 @@
|
|||
The :dataset-format attribute for the making of the filter will be setup to the
|
||||
dataset passed as an argument if no other value is provided.
|
||||
|
||||
The application of this filter is equivalent a the consecutive application of
|
||||
The application of this filter is equivalent to the consecutive application of
|
||||
make-filter and apply-filter."
|
||||
[kind options dataset]
|
||||
(let [opts (if (nil? (:dataset-format options)) (conj options {:dataset-format dataset}))
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
(deftest make-classifiers-options-c45
|
||||
(let [options (make-classifier-options :decission-tree :c45 {:unpruned true :reduced-error-pruning true :only-binary-splits true :no-raising true
|
||||
:no-cleanup true :laplace-smoothing true :pruning-confidence 0.12 :minimum-instances 10
|
||||
:pruning-number-folds 5 :shuffling-random-seed 1})]
|
||||
:pruning-number-folds 5 :random-seed 1})]
|
||||
(is (= (aget options 0)
|
||||
""))
|
||||
(is (= (aget options 1)
|
||||
|
|
Loading…
Reference in a new issue