Documentation for clusterers and small fixes in the clusterer options for :expectation-maximization clusterer

2010-03-16 09:22:49 +01:00 · 2010-03-16 09:22:49 +01:00 · 47e78f5fb4
commit 47e78f5fb4
parent 06adde17e9
4 changed files with 115 additions and 11 deletions
--- a/src/clj_ml/classifiers.clj
+++ b/src/clj_ml/classifiers.clj
@ -156,7 +156,7 @@
   The first argument identifies the kind of classifier and the second
   argument the algorithm to use, e.g. :decission-tree :c45.

-   The colection of classifiers currently supported are:
+   The classifiers currently supported are:

     - :decission-tree :c45
     - :bayes :naive
@ -459,7 +459,23 @@
   The function returns the newly classified instance.

   This call is destructive, the instance passed as an argument
-   is modified."
+   is modified.
+
+    ; We create the instance to classify
+    (def *to-classify* (make-instance *dataset*  {:class :Iris-versicolor
+                                                  :petalwidth 0.2
+                                                  :petallength 1.4
+                                                  :sepalwidth 3.5
+                                                  :sepallength 5.1}))
+
+    ; We use the classifier to check the value for the class
+    (classifier-classify *classifier* *to-classify*)
+     >0.0
+
+    ; We change the class for the instance according to the assigned class
+    (classifier-label *classifier* *to-classify*)
+     >#<Instance 5.1,3.5,1.4,0.2,Iris-setosa>
+"
  ([classifier instance]
     (let [cls (classifier-classify classifier instance)]
       (instance-set-class instance cls))))
--- a/src/clj_ml/clusterers.clj
+++ b/src/clj_ml/clusterers.clj
@ -7,11 +7,12 @@
  clj-ml.clusterers
  "This namespace contains several functions for
   building clusterers using different clustering algorithms. K-means, Cobweb and
+   Expectation maximization algorithms are currently supported.

-   Expectation maximization algorithms are currently supported. Some of these
-   algorithms support incremental building of the clustering without having the
-   full data set in main memory. Functions for evaluating the clusterer as well
-   as for clustering new instances are also supported"
+   Some of these algorithms support incremental building of the clustering without
+   having the full data set in main memory. Functions for evaluating the clusterer
+   as well as for clustering new instances are also supported.
+"
  (:use [clj-ml utils data distance-functions]
        [incanter charts])
  (:import (java.util Date Random)
@ -20,7 +21,8 @@

 ;; Setting up clusterer options

-(defmulti make-clusterer-options
+(defmulti #^{:skip-wiki true}
+  make-clusterer-options
  "Creates ther right parameters for a clusterer"
  (fn [kind map] kind))

@ -51,8 +53,9 @@

 (defmethod make-clusterer-options :expectation-maximization
  ([kind map]
-     (let [cols-val-a (check-option-values {:acuity "-A"
-                                            :cutoff "-C"
+     (let [cols-val-a (check-option-values {:number-clusters "-N"
+                                            :maximum-iterations "-I"
+                                            :minimum-standard-deviation "-M"
                                            :random-seed "-S"}
                                           map
                                           [""])]
@ -78,7 +81,66 @@
        clusterer#)))

 (defmulti make-clusterer
-  "Creates a new clusterer for the given kind algorithm and options"
+  "Creates a new clusterer for the given kind of algorithm and options.
+
+   The first argument identifies the kind of clusterer. The second argument
+   is a map of parameters particular to each clusterer.
+
+   The clusterers currently supported are:
+     - :k-means
+     - :cobweb
+     - :expectation-maximization
+
+   This is the description of the supported clusterers and the parameters accepted
+   by each clusterer algorithm:
+
+     * :k-means
+
+       A clusterer that uses the simple K-Means algorithm to build the clusters
+
+       Parameters:
+
+         - :display-standard-deviation
+             Display the standard deviation of the centroids in the output for the
+             clusterer. Sample value: true
+         - :replace-missing-values
+             Replaces the missing values with the mean/mode. Sample value: true
+         - :number-clusters
+             The number of clusters to be built. Sample value: 3
+         - :random-seed
+             Seed for the random number generator. Sample value: 10
+         - :number-iterations
+             Maximum number of iterations that the algorithm will run. Sample value: 1000
+
+     * :cobweb
+
+       Implementation of the Cobweb incremental algorithm for hierarchical conceptual clustering.
+
+       Parameters:
+
+         - :acuity
+             Acuity. Default value: 1.0
+         - :cutoff
+             Cutoff. Default value: 0.002
+         - :random-seed
+             Seed for the random number generator. Default value: 42.
+
+     * :expectation-maximization
+
+       Implementation of the probabilistic clusterer algorithm for expectation maximization.
+
+       Parameters:
+
+         - :number-clusters
+             Number of clusters to be built. If omitted or -1 is passed as a value, cross-validation
+             will be used to select the number of clusters. Sample value: 3
+         - :maximum-iterations
+             Maximum number of iterations the algorithm will run. Default value: 100
+         - :minimum-standard-deviation
+             Minimum allowable standard deviation for normal density computation. Default value: 1e-6
+         - :random-seed
+             Seed for the random number generator. Default value: 100
+   "
  (fn [kind & options] kind))


--- a/src/clj_ml/data_store.clj
+++ b/src/clj_ml/data_store.clj
@ -29,7 +29,12 @@
        format))))

 (defmulti make-data-store-connection
-  "Connects to a data store"
+  "Connects to a data store.
+
+   - The first parameter is the kind of data store to connect to.
+   - The second parameter is a map with options for the connection
+     to that kind of data store.
+"
  (fn [kind params] kind))

 (defmethod make-data-store-connection :mongodb
--- a/test/clj_ml/clusterers_test.clj
+++ b/test/clj_ml/clusterers_test.clj
@ -26,6 +26,27 @@
    (is (= (aget options 9)
           "1"))))

+(deftest make-clusterers-options-expectation-maximization
+  (let [options (make-clusterer-options :expectation-maximization {:number-clusters 3 :maximum-iterations 10 :minimum-standard-deviation 0.001 :random-seed 30})]
+    (is (= (aget options 0)
+           ""))
+    (is (= (aget options 1)
+           "-N"))
+    (is (= (aget options 2)
+           "3"))
+    (is (= (aget options 3)
+           "-I"))
+    (is (= (aget options 4)
+           "10"))
+    (is (= (aget options 5)
+           "-M"))
+    (is (= (aget options 6)
+           "0.0010"))
+    (is (= (aget options 7)
+           "-S"))
+    (is (= (aget options 8)
+           "30"))))
+

 (deftest make-and-build-clusterer
  (let [ds (make-dataset :test [:a :b] [[1 2] [3 4]])