Documentation and comments for filter.clj. Not finished yet.

2010-03-08 09:20:37 +01:00 · 2010-03-08 09:20:37 +01:00 · ecd2e3579f
commit ecd2e3579f
parent 1e8d1d24ec
3 changed files with 192 additions and 63 deletions
--- a/src/clj_ml/classifiers.clj
+++ b/src/clj_ml/classifiers.clj
@ -11,7 +11,55 @@
   versions so they can be built without having all the dataset instances in memory.

   Functions for evaluating the classifiers built using cross validation or a training
-   set are also provided"
+   set are also provided.
+
+   A sample use of the API for classifiers is shown below:
+
+    (use 'clj-ml.classifiers)
+
+    ; Building a classifier using a C4.5 decision tree
+    (def *classifier* (make-classifier :decission-tree :c45))
+
+    ; We set the class attribute for the loaded dataset.
+    ; *dataset* is supposed to contain a set of instances.
+    (dataset-set-class *dataset* 4)
+
+    ; Training the classifier
+    (classifier-train *classifier* *dataset*)
+
+    ; We evaluate the classifier using a test dataset
+    (def *evaluation* (classifier-evaluate *classifier* :dataset *dataset* *trainingset*))
+
+    ; We retrieve some data from the evaluation result
+    (:kappa *evaluation*)
+    (:root-mean-squared-error *evaluation*)
+    (:precision *evaluation*)
+
+    ; A trained classifier can be used to classify new instances
+    (def *to-classify* (make-instance *dataset* {:class :Iris-versicolor
+                                                 :petalwidth 0.2
+                                                 :petallength 1.4
+                                                 :sepalwidth 3.5
+                                                 :sepallength 5.1}))
+
+    ; We retrieve the index of the class value assigned by the classifier
+    (classifier-classify *classifier* *to-classify*)
+
+    ; We retrieve a symbol with the value assigned by the classifier
+    (classifier-label *classifier* *to-classify*)
+
+   A classifier can also be trained using cross-validation:
+
+    (classifier-evaluate *classifier* :cross-validation *dataset* 10)
+
+   Finally a classifier can be stored in a file for later use:
+
+    (use 'clj-ml.utils)
+
+    (serialize-to-file *classifier*
+     \"/Users/antonio.garrote/Desktop/classifier.bin\")
+
+"
  (:use [clj-ml utils data kernel-functions])
  (:import (java.util Date Random)
           (weka.classifiers.trees J48)
@ -42,7 +90,7 @@
           cols-val-a (check-option-values {:pruning-confidence "-C"
                                            :minimum-instances "-M"
                                            :pruning-number-folds "-N"
-                                            :shuffling-random-seed "-Q"}
+                                            :random-seed "-Q"}
                                           map
                                           cols-val)]
    (into-array cols-val-a))))
@ -69,7 +117,7 @@
                                            :momentum "-M"
                                            :epochs "-N"
                                            :percentage-validation-set "-V"
-                                            :seed "-S"
+                                            :random-seed "-S"
                                            :threshold-number-errors "-E"}
                                           map
                                           cols-val)]
@ -121,26 +169,84 @@
   This is the description of the supported classifiers and the accepted
   option parameters for each of them:

-   * :decission-tree :c45
+    * :decission-tree :c45

-     A classifier building a pruned or unpruned C 4.5 decission tree using
-     Weka J 4.8 implementation.
+      A classifier building a pruned or unpruned C 4.5 decission tree using
+      Weka J 4.8 implementation.

-     Parameters:
+      Parameters:

-       - :unpruned Use unpruned tree. Sample value: true
-       - :reduce-error-pruning Sample value: true
-       - :only-binary-splits Sample value: true
-       - :no-raising Sample value: true
-       - :no-cleanup Sample value: true
-       - :laplace-smoothing For predicted probabilities. Sample value: true
-       - :pruning-confidence Threshold for pruning. Default value: 0.25
-       - :minimum-instances Minimum number of instances per leave. Default
-                            value: 2
-       - :pruning-number-folds Set number of folds for reduced error pruning.
-                               Default value: 3
-       - :shuffling-random-seed Seed for random data shuffling. Default value: 1
-    "
+        - :unpruned
+            Use unpruned tree. Sample value: true
+        - :reduce-error-pruning
+            Sample value: true
+        - :only-binary-splits
+            Sample value: true
+        - :no-raising
+            Sample value: true
+        - :no-cleanup
+            Sample value: true
+        - :laplace-smoothing
+            For predicted probabilities. Sample value: true
+        - :pruning-confidence
+            Threshold for pruning. Default value: 0.25
+        - :minimum-instances
+            Minimum number of instances per leaf. Default value: 2
+        - :pruning-number-folds
+            Set number of folds for reduced error pruning. Default value: 3
+        - :random-seed
+            Seed for random data shuffling. Default value: 1
+
+    * :bayes :naive
+
+      Classifier based on Bayes' theorem with strong independence assumptions among the
+      probabilistic variables.
+
+      Parameters:
+
+        - :kernel-estimator
+            Use kernel density estimator rather than normal. Sample value: true
+        - :supervised-discretization
+            Use supervised discretization to process numeric attributes (see :supervised-discretize
+            filter in clj-ml.filters/make-filter function). Sample value: true
+
+    * :neural-network :multilayer-perceptron
+
+      Classifier built using a feedforward artificial neural network with three or more layers
+      of neurons and nonlinear activation functions. It is able to distinguish data that is not
+      linearly separable.
+
+      Parameters:
+
+        - :no-nominal-to-binary
+            A :nominal-to-binary filter will not be applied by default. (see :supervised-nominal-to-binary
+            filter in clj-ml.filters/make-filter function). Default value: false
+        - :no-numeric-normalization
+            A numeric class will not be normalized. Default value: false
+        - :no-nomalization
+            No attribute will be normalized. Default value: false
+        - :no-reset
+            Resetting the network will not be allowed. Default value: false
+        - :learning-rate-decay
+            Learning rate decay will occur. Default value: false
+        - :learning-rate
+            Learning rate for the backpropagation algorithm. Value should be between [0,1].
+            Default value: 0.3
+        - :momentum
+            Momentum rate for the backpropagation algorithm. Value should be between [0,1].
+            Default value: 0.2
+        - :epochs
+            Number of iterations to train through. Default value: 500
+        - :percentage-validation-set
+            Percentage size of validation set to use to terminate training. If it is not zero
+            it takes precedence over the number of epochs to finish training. Values should be
+            between [0,100]. Default value: 0
+        - :random-seed
+            Value of the seed for the random generator. Values should be longs greater than
+            0. Default value: 0
+        - :threshold-number-errors
+            The consecutive number of errors allowed for validation testing before the network
+            terminates. Values should be greater than 0. Default value: 20"
  (fn [kind algorithm & options] [kind algorithm]))

 (defmethod make-classifier [:decission-tree :c45]
--- a/src/clj_ml/filters.clj
+++ b/src/clj_ml/filters.clj
@ -155,9 +155,12 @@

      Parameters:

-        - :attributes Index of the attributes to be discretized, sample value: [0,4,6]
-        - :invert Invert mathcing sense of the columns, sample value: true
-        - :kononenko Use Kononenko's MDL criterion, sample value: true
+        - :attributes
+            Index of the attributes to be discretized, sample value: [0,4,6]
+        - :invert
+            Invert matching sense of the columns, sample value: true
+        - :kononenko
+            Use Kononenko's MDL criterion, sample value: true

    * :unsupervised-discretize

@ -166,19 +169,25 @@

      Parameters:

-        - :attributes Index of the attributes to be discretized, sample value: [0,4,6]
-        - :dataset-format The dataset where the filter is going to be applied or a
-                          description of the format of its attributes. Sample value:
-                          dataset, (dataset-format dataset)
-        - :unset-class Does not take class attribute into account for the application
-                       of the filter, sample-value: true
+        - :attributes
+            Index of the attributes to be discretized, sample value: [0,4,6]
+        - :dataset-format
+            The dataset where the filter is going to be applied or a
+            description of the format of its attributes. Sample value:
+            dataset, (dataset-format dataset)
+        - :unset-class
+            Does not take class attribute into account for the application
+            of the filter, sample-value: true
        - :binary
-        - :equal-frequency Use equal frequency instead of equal width discretization, sample
-                           value: true
-        - :optimize Optmize the number of bins using leave-one-out estimate of
-                    estimated entropy. Ingores the :binary attribute. sample value: true
-        - :number-bins Defines the number of bins to divide the numeric attributes into
-                       sample value: 3
+        - :equal-frequency
+            Use equal frequency instead of equal width discretization, sample
+            value: true
+        - :optimize
+            Optimize the number of bins using leave-one-out estimate of
+            estimated entropy. Ignores the :binary attribute. Sample value: true
+        - :number-bins
+            Defines the number of bins to divide the numeric attributes into
+            sample value: 3

    * :supervised-nominal-to-binary

@ -186,12 +195,15 @@
      is transformed into k binary attributes if the class is nominal.

      Parameters:
-        - :dataset-format The dataset where the filter is going to be applied or a
-                          description of the format of its attributes. Sample value:
-                          dataset, (dataset-format dataset)
-        - :also-binary Sets if binary attributes are to be coded as nominal ones, sample value: true
-        - :for-each-nominal For each nominal value one binary attribute is created, not only if the
-                            values of the nominal attribute are greater than two.
+        - :dataset-format
+            The dataset where the filter is going to be applied or a
+            description of the format of its attributes. Sample value:
+            dataset, (dataset-format dataset)
+        - :also-binary
+            Sets if binary attributes are to be coded as nominal ones, sample value: true
+        - :for-each-nominal
+            For each nominal value one binary attribute is created, not only if the
+            values of the nominal attribute are greater than two.

    * :unsupervised-nominal-to-binary

@ -199,13 +211,17 @@

      Parameters:

-        - :attributes Index of the attributes to be binarized. Sample value: [1 2 3]
-        - :dataset-format The dataset where the filter is going to be applied or a
-                          description of the format of its attributes. Sample value:
-                          dataset, (dataset-format dataset)
-        - :also-binary Sets if binary attributes are to be coded as nominal ones, sample value: true
-        - :for-each-nominal For each nominal value one binary attribute is created, not only if the
-                            values of the nominal attribute are greater than two., sample value: true
+        - :attributes
+            Index of the attributes to be binarized. Sample value: [1 2 3]
+        - :dataset-format
+            The dataset where the filter is going to be applied or a
+            description of the format of its attributes. Sample value:
+            dataset, (dataset-format dataset)
+        - :also-binary
+            Sets if binary attributes are to be coded as nominal ones, sample value: true
+        - :for-each-nominal
+            For each nominal value one binary attribute is created, not only if the
+            values of the nominal attribute are greater than two. Sample value: true

    * :remove-attributes

@ -213,10 +229,12 @@

      Parameters:

-        - :dataset-format The dataset where the filter is going to be applied or a
-                          description of the format of its attributes. Sample value:
-                          dataset, (dataset-format dataset)
-        - :attributes: Index of the attributes to remove. Sample value: [1 2 3]
+        - :dataset-format
+            The dataset where the filter is going to be applied or a
+            description of the format of its attributes. Sample value:
+            dataset, (dataset-format dataset)
+        - :attributes
+            Index of the attributes to remove. Sample value: [1 2 3]

    * :select-append-attributes

@ -224,11 +242,14 @@

      Parameters:

-        - :dataset-format The dataset where the filter is going to be applied or a
-                          description of the format of its attributes. Sample value:
-                          dataset, (dataset-format dataset)
-        - :attributes Index of the attributes to remove. Sample value: [1 2 3]
-        - :invert Invert the selection of the columns. Sample value: [0 1]
+        - :dataset-format
+            The dataset where the filter is going to be applied or a
+            description of the format of its attributes. Sample value:
+            dataset, (dataset-format dataset)
+        - :attributes
+            Index of the attributes to remove. Sample value: [1 2 3]
+        - :invert
+            Invert the selection of the columns. Sample value: [0 1]

    * :project-attributes

@ -236,10 +257,12 @@

      Parameters:

-        - :dataset-format The dataset where the filter is going to be applied or a
-                          description of the format of its attributes. Sample value:
-                          dataset, (dataset-format dataset)
-        - :invert Invert the selection of columns. Sample value: [0 1]"
+        - :dataset-format
+            The dataset where the filter is going to be applied or a
+            description of the format of its attributes. Sample value:
+            dataset, (dataset-format dataset)
+        - :invert
+            Invert the selection of columns. Sample value: [0 1]"
  (fn [kind options] kind))

 (defmethod make-filter :supervised-discretize
@ -285,7 +308,7 @@
   The :dataset-format attribute for the making of the filter will be setup to the
   dataset passed as an argument if no other value is provided.

-   The application of this filter is equivalent a the consecutive application of
+   The application of this filter is equivalent to the consecutive application of
   make-filter and apply-filter."
  [kind options dataset]
  (let [opts (if (nil? (:dataset-format options)) (conj options {:dataset-format dataset}))
--- a/test/clj_ml/classifiers_test.clj
+++ b/test/clj_ml/classifiers_test.clj
@ -6,7 +6,7 @@
 (deftest make-classifiers-options-c45
  (let [options (make-classifier-options :decission-tree :c45 {:unpruned true :reduced-error-pruning true :only-binary-splits true :no-raising true
                                                               :no-cleanup true :laplace-smoothing true :pruning-confidence 0.12 :minimum-instances 10
-                                                               :pruning-number-folds 5 :shuffling-random-seed 1})]
+                                                               :pruning-number-folds 5 :random-seed 1})]
    (is (= (aget options 0)
           ""))
    (is (= (aget options 1)