Initial support for clustering algorithms

This commit is contained in:
Antonio Garrote 2010-02-28 20:29:51 +01:00
parent d149acc376
commit a7990c5373
4 changed files with 137 additions and 4 deletions

53
README
View file

@ -31,6 +31,22 @@ the jar manually.
<version>0.0.3-SNAPSHOT</version>
</dependency>
## Supported algorithms
* Filters
- supervised discretize
- unsupervised discretize
- supervised nominal to binary
- unsupervised nominal to binary
* Classifiers
- C4.5 (J4.8)
- naive Bayes
- multilayer perceptron
* Clusterers
- k-means
## Usage
* I/O of data
@ -48,7 +64,7 @@ the jar manually.
REPL>(use 'clj-ml.data)
REPL>; Defining a dataset
REPL>(def ds (make-dataset "name" [:length :width {:kind [:good :bad]}] [12 34 :good] [24 53 :bad] ]))
REPL>(def ds (make-dataset "name" [:length :width {:kind [:good :bad]}] [ [12 34 :good] [24 53 :bad] ]))
REPL>ds
#<ClojureInstances @relation name
@ -193,6 +209,41 @@ the jar manually.
REPL>(serialize-to-file classifier
REPL> "/Users/antonio.garrote/Desktop/classifier.bin")
* Using clusterers
REPL>(use 'clj-ml.clusterers)
REPL> ; we build a clusterer using k-means and three clusters
REPL> (def kmeans (make-clusterer :k-means {:number-clusters 3}))
REPL> ; this clustering algorithm needs a dataset without a class set,
REPL> ; so we unset the class index (the attribute itself stays in the data)
REPL> (dataset-remove-class ds)
REPL> ; we build the clusters
REPL> (clusterer-build kmeans ds)
REPL> kmeans
#<SimpleKMeans
kMeans
======
Number of iterations: 3
Within cluster sum of squared errors: 7.817456892309574
Missing values globally replaced with mean/mode
Cluster centroids:
Cluster#
Attribute Full Data 0 1 2
(150) (50) (50) (50)
==================================================================================
sepallength 5.8433 5.936 5.006 6.588
sepalwidth 3.054 2.77 3.418 2.974
petallength 3.7587 4.26 1.464 5.552
petalwidth 1.1987 1.326 0.244 2.026
class Iris-setosa Iris-versicolor Iris-setosa Iris-virginica
## License
MIT License

View file

@ -1,5 +1,5 @@
;;
;; Data processing of data with different filtering algorithms
;; Classifiers
;; @author Antonio Garrote
;;
@ -69,7 +69,7 @@
(defmacro make-classifier-m
([kind algorithm classifier-class options]
`(let [options-read# (if (empty? ~options) {} ~options)
`(let [options-read# (if (empty? ~options) {} (first ~options))
classifier# (new ~classifier-class)
opts# (make-classifier-options ~kind ~algorithm options-read#)]
(.setOptions classifier# opts#)

56
src/clj_ml/clusterers.clj Normal file
View file

@ -0,0 +1,56 @@
;;
;; Clusterers
;; @author Antonio Garrote
;;
(ns clj-ml.clusterers
(:use [clj-ml utils data])
(:import (java.util Date Random)
(weka.clusterers SimpleKMeans)))
;; Setting up clusterer options
(defmulti make-clusterer-options
  "Creates the right parameters for a clusterer.
   Dispatches on the clusterer kind (first argument); the second argument is
   the map of user-supplied options and is ignored for dispatch."
  (fn [kind _options] kind))
(defmethod make-clusterer-options :k-means
  [kind options]
  ;; Two tables map the keyword options accepted by this library to the
  ;; command-line switches handed to the clusterer's .setOptions.
  ;; check-options / check-option-values come from clj-ml.utils (not visible
  ;; here) -- presumably the first handles on/off switches and the second
  ;; handles switches that carry a value; confirm against that namespace.
  (let [switch-flags {:display-standard-deviation "-V"
                      :replace-missing-values "-M"
                      :preserve-instances-order "-O"}
        valued-flags {:number-clusters "-N"
                      :random-seed "-S"
                      :number-iterations "-I"}
        ;; the accumulator is seeded with a single empty string
        with-switches (check-options switch-flags options [""])
        with-values (check-option-values valued-flags options with-switches)]
    (into-array with-values)))
;; Building clusterers
(defmacro make-clusterer-m
  "Expands into code that instantiates `clusterer-class` with its no-argument
   constructor, configures it through .setOptions and returns the instance.

   kind            - clusterer kind, forwarded to make-clusterer-options
   clusterer-class - class symbol; it is used with `new`, which is why this
                     is a macro rather than a function
   options         - form yielding the rest-args sequence of the caller; its
                     first element is the options map. An empty sequence or a
                     nil first element is treated as an empty map."
  ([kind clusterer-class options]
     ;; Bind the options form once so that it is evaluated a single time in
     ;; the expansion, whatever expression the caller passes.
     `(let [options# ~options
            options-read# (or (first options#) {})
            clusterer# (new ~clusterer-class)
            opts# (make-clusterer-options ~kind options-read#)]
        (.setOptions clusterer# opts#)
        clusterer#)))
(defmulti make-clusterer
  "Builds a new clusterer for the requested kind of algorithm.
   Dispatches on `kind`; any further arguments (the options) are passed
   through to the selected method untouched."
  (fn [kind & _] kind))
;; k-means implementation of make-clusterer: delegates to make-clusterer-m
;; with the SimpleKMeans class imported from weka.clusterers.
;; `options` is the rest-args sequence, so the options map (when given) is
;; its first element.
(defmethod make-clusterer :k-means
([kind & options]
(make-clusterer-m kind SimpleKMeans options)))
;; Clustering data
(defn clusterer-build
  "Runs the clustering algorithm held by `clusterer` over `dataset` by
   calling its .buildClusterer method. The clusterer object is updated in
   place; the result of the Java call is returned as is."
  [clusterer dataset]
  (.buildClusterer clusterer dataset))

View file

@ -98,12 +98,13 @@
;; we haven't received a vector so we create an empty dataset
(new Instances (key-to-str name) (parse-attributes attributes) capacity-or-values))]
;; by default the class is the last attribute in the dataset
(.setClassIndex ds (- (.numAttributes ds) 1))
;; (.setClassIndex ds (- (.numAttributes ds) 1))
ds)))
;; dataset information
(defn dataset-class-values [dataset]
"Returns the possible values for the class attribute"
(let [class-attr (.classAttribute dataset)
values (.enumerateValues class-attr)]
(loop [continue (.hasMoreElements values)
@ -116,6 +117,7 @@
acum))))
(defn dataset-values-at [dataset-or-instance pos]
"Returns the possible values for a nominal attribute at the provided position"
(let [class-attr (.attribute dataset-or-instance pos)
values (.enumerateValues class-attr)]
(if (nil? values)
@ -129,6 +131,24 @@
(conj acum {(keyword val) index})))
acum)))))
(defn dataset-attributes-definition
  "Returns the definition of the attributes of this dataset.
   The result is a vector with one entry per attribute, in attribute order:
   a nominal attribute yields a single-entry map from its name (as a keyword)
   to the keys returned by dataset-values-at for that position; any other
   attribute yields just its name as a keyword."
  [dataset]
  ;; `for` is lazy, so wrap it in `vec` to keep returning a realized vector.
  (vec
   (for [index (range (.numAttributes dataset))]
     (let [attr (.attribute dataset index)
           attr-name (keyword (.name attr))]
       (if (.isNominal attr)
         {attr-name (keys (dataset-values-at dataset index))}
         attr-name)))))
;; manipulation of instances
(defn instance-set-class [instance pos]
@ -192,6 +212,12 @@
(do (.setClassIndex dataset pos)
dataset))
(defn dataset-remove-class
  "Unsets the class of the dataset by setting its class index to -1.
   No attribute is deleted. The dataset is modified in place and returned."
  [dataset]
  (doto dataset
    (.setClassIndex -1)))
(defn dataset-count
  "Returns the number of elements (instances) in a dataset"
  [dataset]
  (.numInstances dataset))