First commit

This commit is contained in:
Antonio Garrote 2010-02-28 13:14:17 +01:00
commit 360e507bd2
11 changed files with 1216 additions and 0 deletions

180
README.markdown Normal file
View file

@ -0,0 +1,180 @@
# clj-ml
A machine learning library for Clojure built on top of Weka and friends
## Usage
* I/O of data
Loading data from a file (an ARFF file in this example):
(use 'clj-ml.io)
; Loading data from an ARFF file, XRFF and CSV are also supported
(def ds (load-instances :arff "file:///Applications/weka-3-6-2/data/iris.arff"))
; Saving data in a different format
(save-instances :csv "file:///tmp/iris.csv" ds)
* Working with datasets
(use 'clj-ml.data)
; Defining a dataset
(def ds (make-dataset ; name of the dataset
"name"
; two numeric attributes and one nominal
[:length :width {:kind [:good :bad]}]
; initial data
[ [12 34 :good]
[24 53 :bad] ]))
ds
>#<ClojureInstances @relation name
>
>@attribute length numeric
>@attribute width numeric
>@attribute kind {good,bad}
>
>@data
>12,34,good
>24,53,bad>
; Using datasets like sequences
(dataset-seq ds)
>(#<Instance 12,34,good> #<Instance 24,53,bad>)
; Transforming instances into maps or vectors
(instance-to-map (first (dataset-seq ds)))
>{:kind :good, :width 34.0, :length 12.0}
(instance-to-vector (dataset-at ds 0))
* Filtering datasets
(use 'clj-ml.filters)
(def ds (load-instances :arff
"file:///Applications/weka-3-6-2/data/iris.arff"))
; Discretizing a numeric attribute using an unsupervised filter
(def discretize (make-filter :unsupervised-discretize
{:dataset ds
:attributes [0 2]}))
(def filtered-ds (filter-process discretize ds))
* Using classifiers
(use 'clj-ml.classifiers)
; Building a classifier using a C4.5 decision tree
(def classifier (make-classifier :decission-tree :c45))
; We set the class attribute for the loaded dataset
(dataset-set-class ds 4)
; Training the classifier
(classifier-train classifier ds)
>#<J48 J48 pruned tree
>------------------
>
>petalwidth <= 0.6: Iris-setosa (50.0)
>petalwidth > 0.6
>| petalwidth <= 1.7
>| | petallength <= 4.9: Iris-versicolor (48.0/1.0)
>| | petallength > 4.9
>| | | petalwidth <= 1.5: Iris-virginica (3.0)
>| | | petalwidth > 1.5: Iris-versicolor (3.0/1.0)
>| petalwidth > 1.7: Iris-virginica (46.0/1.0)
>
>Number of Leaves : 5
>
>Size of the tree : 9
; We evaluate the classifier using a test dataset
; last parameter should be a different test dataset, here we are using the same
(def evaluation (classifier-evaluate classifier :dataset ds ds))
>=== Confusion Matrix ===
>
> a b c <-- classified as
> 50 0 0 | a = Iris-setosa
> 0 49 1 | b = Iris-versicolor
> 0 2 48 | c = Iris-virginica
>
>=== Summary ===
>
>Correctly Classified Instances 147 98 %
>Incorrectly Classified Instances 3 2 %
>Kappa statistic 0.97
>Mean absolute error 0.0233
>Root mean squared error 0.108
>Relative absolute error 5.2482 %
>Root relative squared error 22.9089 %
>Total Number of Instances 150
(:kappa evaluation)
>0.97
(:root-mean-squared-error evaluation)
>0.10799370769526968
(:precision evaluation)
>{:Iris-setosa 1.0, :Iris-versicolor 0.9607843137254902, :Iris-virginica
0.9795918367346939}
; The classifier can also be evaluated using cross-validation
(classifier-evaluate classifier :cross-validation ds 10)
>=== Confusion Matrix ===
>
> a b c <-- classified as
> 49 1 0 | a = Iris-setosa
> 0 47 3 | b = Iris-versicolor
> 0 4 46 | c = Iris-virginica
>
>=== Summary ===
>
>Correctly Classified Instances 142 94.6667 %
>Incorrectly Classified Instances 8 5.3333 %
>Kappa statistic 0.92
>Mean absolute error 0.0452
>Root mean squared error 0.1892
>Relative absolute error 10.1707 %
>Root relative squared error 40.1278 %
>Total Number of Instances 150
; A trained classifier can be used to classify new instances
(def to-classify (make-instance ds
{:class :Iris-versicolor,
:petalwidth 0.2,
:petallength 1.4,
:sepalwidth 3.5,
:sepallength 5.1}))
(classifier-classify classifier to-classify)
> 0.0
(classifier-label classifier to-classify)
>#<Instance 5.1,3.5,1.4,0.2,Iris-setosa>
; The classifiers can be saved and restored later
(use 'clj-ml.utils)
(serialize-to-file classifier
"/Users/antonio.garrote/Desktop/classifier.bin")
## Installation
In order to install the library you must first install Leiningen.
You should also download the Weka 3.6.2 jar from the official weka homepage.
If maven complains about not finding weka, follow its instructions to install
the jar manually.
### To install from source
* git clone the project
* $ lein deps
* $ lein compile
* $ lein compile-java
* $ lein uberjar
## License
MIT License

8
project.clj Normal file
View file

@ -0,0 +1,8 @@
;; Leiningen project definition for the clj-ml library.
(defproject clj-ml "0.0.3-SNAPSHOT"
:description "Machine Learning library for Clojure"
;; Directory holding the Java sources of the project
;; (presumably consumed by the lein-javac plugin listed below -- TODO confirm)
:java-source-path "src/java"
:javac-fork "true"
:dependencies [[org.clojure/clojure "1.1.0"]
[org.clojure/clojure-contrib "1.1.0"]
;; NOTE(review): assumed to provide the `lein compile-java` task used in the README -- confirm
[lein-javac "0.0.2-SNAPSHOT"]
[weka/weka "3.6.2"]])

188
src/clj_ml/classifiers.clj Normal file
View file

@ -0,0 +1,188 @@
;;
;; Building, training and evaluating classifiers
;; @author Antonio Garrote
;;
(ns clj-ml.classifiers
(:use [clj-ml utils data])
(:import (java.util Date Random)
(weka.classifiers.trees J48)
(weka.classifiers.bayes NaiveBayes)
(weka.classifiers.functions MultilayerPerceptron)
(weka.classifiers Evaluation)))
;; Setting up classifier options
;; Dispatch only looks at the [kind algorithm] pair; the option map is ignored.
(defmulti make-classifier-options
  "Builds the array of option strings for a classifier, selected by the
  [kind algorithm] pair, from a map of options."
  (fn [kind algorithm _] [kind algorithm]))
;; Options for the C4.5 decision tree: boolean flags first, then the options
;; that carry a value, all appended after an initial empty string.
(defmethod make-classifier-options [:decission-tree :c45]
  [kind algorithm map]
  (let [flags      {:unpruned "-U"
                    :reduced-error-pruning "-R"
                    :only-binary-splits "-B"
                    :no-raising "-S"
                    :no-cleanup "-L"
                    :laplace-smoothing "-A"}
        valued     {:pruning-confidence "-C"
                    :minimum-instances "-M"
                    :pruning-number-folds "-N"
                    :shuffling-random-seed "-Q"}
        with-flags (check-options flags map [""])]
    (into-array (check-option-values valued map with-flags))))
;; Options for the naive Bayes classifier: only boolean flags are supported,
;; appended after an initial empty string.
(defmethod make-classifier-options [:bayes :naive]
  [kind algorithm map]
  (let [flags {:kernel-estimator "-K"
               :supervised-discretization "-D"
               :old-format "-O"}]
    (into-array (check-options flags map [""]))))
;; Options for the multilayer perceptron: boolean flags first, then the
;; options that carry a value, all appended after an initial empty string.
(defmethod make-classifier-options [:neural-network :multilayer-perceptron]
  [kind algorithm map]
  (let [flags      {:no-nominal-to-binary "-B"
                    :no-numeric-normalization "-C"
                    :no-normalization "-I"
                    :no-reset "-R"
                    :learning-rate-decay "-D"}
        valued     {:learning-rate "-L"
                    :momentum "-M"
                    :epochs "-N"
                    :percentage-validation-set "-V"
                    :seed "-S"
                    :threshold-number-errors "-E"}
        with-flags (check-options flags map [""])]
    (into-array (check-option-values valued map with-flags))))
;; Building classifiers
(defmacro make-classifier-m
  "Expands into code that instantiates classifier-class, configures it with
  the options built by make-classifier-options and returns the instance.

  options may be an option map, or the rest-args sequence of a caller whose
  first element is the option map (this is how make-classifier passes it).
  Anything else is treated as 'no options'."
  ([kind algorithm classifier-class options]
     `(let [raw# ~options
            ;; The callers hand over their `& options` seq. Looking keys up
            ;; directly in that seq always yields nil, so unwrap the map first.
            options-read# (cond
                            (map? raw#) raw#
                            (empty? raw#) {}
                            (map? (first raw#)) (first raw#)
                            :else {})
            classifier# (new ~classifier-class)
            opts# (make-classifier-options ~kind ~algorithm options-read#)]
        (.setOptions classifier# opts#)
        classifier#)))
;; Dispatch only looks at the [kind algorithm] pair; trailing options are
;; forwarded untouched to make-classifier-m.
(defmulti make-classifier
  "Instantiates a classifier for the given kind and algorithm, configured
  with the optional options."
  (fn [kind algorithm & _] [kind algorithm]))

;; C4.5 decision tree, backed by J48
(defmethod make-classifier [:decission-tree :c45]
  [kind algorithm & options]
  (make-classifier-m kind algorithm J48 options))

;; Naive Bayes
(defmethod make-classifier [:bayes :naive]
  [kind algorithm & options]
  (make-classifier-m kind algorithm NaiveBayes options))

;; Multilayer perceptron neural network
(defmethod make-classifier [:neural-network :multilayer-perceptron]
  [kind algorithm & options]
  (make-classifier-m kind algorithm MultilayerPerceptron options))
;; Training classifiers
(defn classifier-train
  "Builds the classifier from the training dataset and returns the same
  (now trained) classifier."
  [classifier dataset]
  (doto classifier
    (.buildClassifier dataset)))
;; Evaluating classifiers
;; Calls the zero-argument function f and returns its result. If f throws
;; an Exception, {:nan <exception message>} is returned instead.
(defn- try-metric [f]
  (try
    (f)
    (catch Exception failure
      {:nan (.getMessage failure)})))
;; Applies f to every index stored in the class-values map and returns a
;; map with the same keys whose values are the results of f.
(defn- try-multiple-values-metric [class-values f]
  (into {}
        (map (fn [[label index]] [label (f index)])
             class-values)))
(defn- collect-evaluation-results
  "Prints the confusion matrix and the summary of the evaluation and
  returns a map with every collected statistic. Metrics that throw are
  reported as {:nan message}; per-class metrics are maps keyed like
  class-values. The evaluation itself is stored under :evaluation-object."
  [class-values evaluation]
  (let [;; evaluates a per-class metric for every class index, guarding each call
        per-class (fn [metric]
                    (try-multiple-values-metric
                     class-values
                     (fn [i] (try-metric (fn [] (metric i))))))]
    (println (.toMatrixString evaluation))
    (println "=== Summary ===")
    (println (.toSummaryString evaluation))
    {:correct (try-metric (fn [] (.correct evaluation)))
     :incorrect (try-metric (fn [] (.incorrect evaluation)))
     :unclassified (try-metric (fn [] (.unclassified evaluation)))
     :percentage-correct (try-metric (fn [] (.pctCorrect evaluation)))
     :percentage-incorrect (try-metric (fn [] (.pctIncorrect evaluation)))
     :percentage-unclassified (try-metric (fn [] (.pctUnclassified evaluation)))
     :error-rate (try-metric (fn [] (.errorRate evaluation)))
     :mean-absolute-error (try-metric (fn [] (.meanAbsoluteError evaluation)))
     :relative-absolute-error (try-metric (fn [] (.relativeAbsoluteError evaluation)))
     :root-mean-squared-error (try-metric (fn [] (.rootMeanSquaredError evaluation)))
     :root-relative-squared-error (try-metric (fn [] (.rootRelativeSquaredError evaluation)))
     :correlation-coefficient (try-metric (fn [] (.correlationCoefficient evaluation)))
     :average-cost (try-metric (fn [] (.avgCost evaluation)))
     :kappa (try-metric (fn [] (.kappa evaluation)))
     :kb-information (try-metric (fn [] (.KBInformation evaluation)))
     :kb-mean-information (try-metric (fn [] (.KBMeanInformation evaluation)))
     :kb-relative-information (try-metric (fn [] (.KBRelativeInformation evaluation)))
     :sf-entropy-gain (try-metric (fn [] (.SFEntropyGain evaluation)))
     :sf-mean-entropy-gain (try-metric (fn [] (.SFMeanEntropyGain evaluation)))
     :roc-area (per-class (fn [i] (.areaUnderROC evaluation i)))
     :false-positive-rate (per-class (fn [i] (.falsePositiveRate evaluation i)))
     :false-negative-rate (per-class (fn [i] (.falseNegativeRate evaluation i)))
     :f-measure (per-class (fn [i] (.fMeasure evaluation i)))
     :precision (per-class (fn [i] (.precision evaluation i)))
     :recall (per-class (fn [i] (.recall evaluation i)))
     :evaluation-object evaluation}))
;; Dispatch only looks at the evaluation mode (e.g. :dataset or :cross-validation).
(defmulti classifier-evaluate
  "Evaluates a trained classifier, either against a provided test dataset
  or using cross-validation, depending on mode."
  (fn [classifier mode & _] mode))
;; Evaluation against a test dataset.
;; evaluation-data is [training-data test-data].
(defmethod classifier-evaluate :dataset
  [classifier mode & evaluation-data]
  (let [training-data (nth evaluation-data 0)
        test-data     (nth evaluation-data 1)
        evaluation    (Evaluation. training-data)
        class-values  (dataset-class-values training-data)
        ;; empty array for the trailing array argument of evaluateModel
        no-extra-args (into-array [])]
    (.evaluateModel evaluation classifier test-data no-extra-args)
    (collect-evaluation-results class-values evaluation)))
;; Evaluation using cross-validation.
;; evaluation-data is [training-data folds] or [training-data folds seed].
;; The optional numeric seed makes the random partitioning reproducible;
;; when it is omitted the current time is used, as before.
(defmethod classifier-evaluate :cross-validation
  [classifier mode & evaluation-data]
  (let [training-data (nth evaluation-data 0)
        folds (nth evaluation-data 1)
        seed (if (> (count evaluation-data) 2)
               (long (nth evaluation-data 2))
               (.getTime (new Date)))
        evaluation (new Evaluation training-data)
        class-values (dataset-class-values training-data)]
    (.crossValidateModel evaluation classifier training-data folds (new Random seed) (into-array []))
    (collect-evaluation-results class-values evaluation)))
;; Classifying instances
(defn classifier-classify
  "Returns the result of asking the classifier to classify the instance."
  [classifier instance]
  (. classifier (classifyInstance instance)))
(defn classifier-label
  "Classifies the instance and stores the result as its class value.
  Returns the labelled instance."
  [classifier instance]
  (instance-set-class instance
                      (classifier-classify classifier instance)))

228
src/clj_ml/data.clj Normal file
View file

@ -0,0 +1,228 @@
;;
;; Manipulation of datasets and instances
;; @author Antonio Garrote
;;
(ns clj-ml.data
(:use [clj-ml utils])
(:import (weka.core Instance Instances FastVector Attribute)
(cljml ClojureInstances)))
;; Construction of individual data and datasets
;; Returns the name of the attribute found at position pos of a dataset or instance.
(defn attribute-name-at- [dataset-or-instance pos]
  (-> dataset-or-instance
      (.attribute pos)
      (.name)))
;; Returns the position of the attribute named attr (keyword or string) in
;; a dataset or instance. Throws an Exception when no attribute has that name.
(defn- index-attr [dataset-or-instance attr]
  (let [max (.numAttributes dataset-or-instance)
        attrs (key-to-str attr)]
    (loop [c 0]
      (cond
        ;; every attribute was inspected without a match; the exception must be
        ;; constructed with `new` (the former `.Exception` form was a method
        ;; call on the message string and failed with an unrelated error)
        (= c max)
        (throw (new Exception (str "Attribute " attrs " not found")))

        (= attrs (attribute-name-at- dataset-or-instance c))
        c

        :else
        (recur (+ c 1))))))
(defn make-instance
  "Creates a new instance attached to dataset from a collection of values.
  Each element is either a keyword/string (stored as a nominal value at the
  element's position), a [attribute-name value] pair (stored at the position
  of the named attribute, which is how entries of a map are handled) or any
  other value, which is coerced to double and stored at the element's
  position. The default weight is 1."
  ([dataset vector]
     (make-instance dataset 1 vector))
  ([dataset weight vector]
     (let [inst     (Instance. (count vector))
           nominal? (fn [x] (or (keyword? x) (string? x)))
           ;; stores x at position idx, as a label or as a double
           put!     (fn [idx x]
                      (if (nominal? x)
                        (.setValue inst idx (key-to-str x))
                        (.setValue inst idx (double x))))]
       (.setDataset inst dataset)
       (doseq [[position entry] (map (fn [i x] [i x]) (iterate inc 0) vector)]
         (cond
           ;; nominal entry in keyword or string form
           (nominal? entry)
           (put! position entry)

           ;; [attribute-name value] pair
           (sequential? entry)
           (put! (index-attr inst (key-to-str (nth entry 0)))
                 (nth entry 1))

           ;; a double value for the entry
           :else
           (put! position entry)))
       (.setWeight inst (double weight))
       inst)))
(defn- parse-attributes
  "Builds the FastVector of Attribute objects described by attributes.
  A plain keyword/string becomes an attribute with that name. A single
  entry map {name spec} becomes an attribute built from the name and either
  a FastVector of labels (when spec is sequential) or spec itself."
  [attributes]
  (let [result   (FastVector. (count attributes))
        ;; copies the labels, as strings, into a fresh FastVector
        labels   (fn [values]
                   (let [fv (FastVector.)]
                     (doseq [v values]
                       (.addElement fv (key-to-str v)))
                     fv))
        build    (fn [att]
                   (if (map? att)
                     (let [spec (first (vals att))]
                       (if (sequential? spec)
                         (let [label-vector (labels spec)]
                           (Attribute. (key-to-str (first (keys att))) label-vector))
                         (Attribute. (key-to-str (first (keys att))) spec)))
                     (Attribute. (key-to-str att))))]
    (doseq [att attributes]
      (.addElement result (build att)))
    result))
(defn make-dataset
  "Creates a new dataset with the given name and attributes. The last
  argument is either a capacity, producing an empty dataset, or a sequence
  of rows that are turned into instances (with the given weight, 1 by
  default) and added to the dataset. The class is set to the last attribute."
  ([name attributes capacity-or-values]
     (make-dataset name attributes 1 capacity-or-values))
  ([name attributes weight capacity-or-values]
     (let [ds (if (sequential? capacity-or-values)
                ;; initial rows were provided: create the dataset and fill it
                (let [filled (ClojureInstances. (key-to-str name)
                                                (parse-attributes attributes)
                                                (count capacity-or-values))]
                  (doseq [row capacity-or-values]
                    (.add filled (make-instance filled weight row)))
                  filled)
                ;; only a capacity was provided: create an empty dataset
                (Instances. (key-to-str name)
                            (parse-attributes attributes)
                            capacity-or-values))
           last-index (- (.numAttributes ds) 1)]
       ;; by default the class is the last attribute in the dataset
       (.setClassIndex ds last-index)
       ds)))
;; dataset information
(defn dataset-class-values
  "Returns a map from each value of the dataset's class attribute (as a
  keyword) to the index of that value. Returns an empty map when the class
  attribute exposes no enumeration of values (enumerateValues returns nil),
  instead of failing with a NullPointerException."
  [dataset]
  (let [class-attr (.classAttribute dataset)
        values (.enumerateValues class-attr)]
    (if (nil? values)
      {}
      (loop [continue (.hasMoreElements values)
             acum {}]
        (if continue
          (let [val (.nextElement values)
                index (.indexOfValue class-attr val)]
            (recur (.hasMoreElements values)
                   (conj acum {(keyword val) index})))
          acum)))))
;; Returns a map from each value of the attribute at position pos (as a
;; keyword) to the index of that value, or :not-nominal when the attribute
;; exposes no enumeration of values.
(defn dataset-values-at [dataset-or-instance pos]
  (let [attr   (.attribute dataset-or-instance pos)
        labels (.enumerateValues attr)]
    (if (nil? labels)
      :not-nominal
      (into {}
            (map (fn [label] [(keyword label) (.indexOfValue attr label)])
                 (enumeration-seq labels))))))
;; manipulation of instances
(defn instance-set-class
  "Sets the class value of the instance to pos and returns the instance."
  [instance pos]
  ;; the docstring now precedes the parameter vector so that it is attached
  ;; to the var instead of being evaluated and discarded
  (.setClassValue instance pos)
  instance)
(defn instance-get-class
  "Returns the class value of the instance, as reported by classValue."
  [instance]
  (.classValue instance))
(defn instance-value-at
  "Returns the value of the attribute at position pos of the instance.
  Nominal attributes yield the keyword of the matching label, any other
  attribute yields the raw value stored in the instance. A nominal value
  that matches no label (for instance a NaN) yields nil."
  [instance pos]
  (let [attr (.attribute instance pos)
        val (.value instance pos)]
    (if (.isNominal attr)
      ;; `==` compares the integer label index with the double stored in the
      ;; instance numerically, and `some` stops at the end of the labels, so
      ;; an unmatched value can no longer loop forever
      (some (fn [[label index]] (when (== index val) label))
            (dataset-values-at instance pos))
      val)))
(defn instance-to-vector
  "Returns a vector holding, in order, the value of every position of the
  instance as computed by instance-value-at."
  [instance]
  (vec (map (fn [idx] (instance-value-at instance idx))
            (range (.numValues instance)))))
(defn instance-to-map
  "Returns a map from attribute name (as a keyword) to the value of that
  attribute in the instance, as computed by instance-value-at."
  [instance]
  (into {}
        (map (fn [idx]
               [(keyword (.name (.attribute instance idx)))
                (instance-value-at instance idx)])
             (range (.numValues instance)))))
;; manipulation of datasets
(defn dataset-seq
  "Builds a Clojure sequence with the instances of the dataset. Datasets
  that are not ClojureInstances are wrapped in a new ClojureInstances first."
  [dataset]
  ;; instance? also accepts subclasses of ClojureInstances, which the former
  ;; exact class comparison needlessly copied
  (if (instance? ClojureInstances dataset)
    (seq dataset)
    (seq (new ClojureInstances dataset))))
(defn dataset-set-class
  "Sets the index of the attribute that is the class of the dataset and
  returns the dataset."
  [dataset pos]
  (.setClassIndex dataset pos)
  dataset)
(defn dataset-count
  "Returns the number of instances in the dataset."
  [dataset]
  (.numInstances dataset))
(defn dataset-add
  "Adds a new instance to the dataset and returns the dataset. The value to
  add is either a weka.core.Instance (or an instance of a subclass), added
  as is, or a collection of values from which an instance with the given
  weight (1 by default) is built."
  ([dataset vector]
     (dataset-add dataset 1 vector))
  ([dataset weight vector]
     ;; instance? also recognises subclasses of Instance, which the former
     ;; exact class comparison treated as plain collections of values
     (if (instance? weka.core.Instance vector)
       (.add dataset vector)
       (let [instance (make-instance dataset weight vector)]
         (.add dataset instance)))
     dataset))
(defn dataset-extract-at
  "Deletes the instance at position pos from the dataset and returns it."
  [dataset pos]
  (let [extracted (.instance dataset pos)]
    (.delete dataset pos)
    extracted))
(defn dataset-at
  "Returns (without removing it) the instance stored at position pos of
  the dataset."
  [dataset pos]
  (. dataset (instance pos)))
(defn dataset-pop
  "Deletes the first instance (position 0) from the dataset and returns it."
  [dataset]
  (let [head (.instance dataset 0)]
    (.delete dataset 0)
    head))

106
src/clj_ml/filters.clj Normal file
View file

@ -0,0 +1,106 @@
;;
;; Data processing of data with different filtering algorithms
;; @author Antonio Garrote
;;
(ns clj-ml.filters
(:use [clj-ml data utils])
(:import (weka.filters Filter)))
;; Options for the filters
;; Dispatch only looks at the kind of filter; the option map is ignored.
(defmulti make-filter-options
  "Builds the array of option strings for a filter of the given kind from
  a map of options."
  (fn [kind _] kind))
;; Options for the supervised discretize filter.
;; :attributes holds 0-based attribute indices; they are emitted 1-based,
;; comma separated, after "-R". When :attributes is missing or empty the
;; "-R" option is left out altogether (this used to fail with a
;; StringIndexOutOfBoundsException).
(defmethod make-filter-options :supervised-discretize
  [kind map]
  (let [cols (get map :attributes)
        cols-val-a (if (empty? cols)
                     []
                     ["-R" (apply str (interpose "," (for [c cols] (+ c 1))))])
        cols-val-b (check-options {:invert "-V"
                                   :binary "-D"
                                   :better-encoding "-E"
                                   :kononenko "-K"}
                                  map
                                  cols-val-a)]
    ;; typed array: the result is a String[] even when no option is present
    (into-array String cols-val-b)))
;; Options for the unsupervised discretize filter.
;; :attributes holds 0-based attribute indices; they are emitted 1-based,
;; comma separated, after "-R". When :attributes is missing or empty the
;; "-R" option is left out altogether (this used to fail with a
;; StringIndexOutOfBoundsException). Flags come next, then valued options.
(defmethod make-filter-options :unsupervised-discretize
  [kind map]
  (let [cols (get map :attributes)
        cols-val-a (if (empty? cols)
                     []
                     ["-R" (apply str (interpose "," (for [c cols] (+ c 1))))])
        cols-val-b (check-options {:unset-class "-unset-class-temporarily"
                                   :binary "-D"
                                   :better-encoding "-E"
                                   :equal-frequency "-F"
                                   :optimize "-O"}
                                  map
                                  cols-val-a)
        cols-val-c (check-option-values {:number-bins "-B"
                                         :weight-bins "-M"}
                                        map
                                        cols-val-b)]
    ;; typed array: the result is a String[] even when no option is present
    (into-array String cols-val-c)))
;; Options for the supervised nominal-to-binary filter: only boolean flags,
;; appended after an initial empty string.
(defmethod make-filter-options :supervised-nominal-to-binary
  [kind map]
  (let [flags {:also-binary "-N"
               :for-each-nominal "-A"}]
    (into-array (check-options flags map [""]))))
;; Options for the unsupervised nominal-to-binary filter.
;; :attributes holds 0-based attribute indices; they are emitted 1-based,
;; comma separated, after "-R". When :attributes is missing or empty the
;; "-R" option is left out altogether (this used to fail with a
;; StringIndexOutOfBoundsException).
(defmethod make-filter-options :unsupervised-nominal-to-binary
  [kind map]
  (let [cols (get map :attributes)
        cols-val-a (if (empty? cols)
                     []
                     ["-R" (apply str (interpose "," (for [c cols] (+ c 1))))])
        cols-val-b (check-options {:invert "-V"
                                   :also-binary "-N"
                                   :for-each-nominal "-A"}
                                  map
                                  cols-val-a)]
    ;; typed array: the result is a String[] even when no option is present
    (into-array String cols-val-b)))
;; Creation of filters
;; Expands into code that instantiates filter-class, applies the options
;; built by make-filter-options, sets the dataset found under :dataset in
;; options as the input format and evaluates to the configured filter.
(defmacro make-filter-m [kind options filter-class]
  `(doto (new ~filter-class)
     (.setOptions (make-filter-options ~kind ~options))
     (.setInputFormat (get ~options :dataset))))
;; Dispatch only looks at the kind of filter.
(defmulti make-filter
  "Creates a filter of the given kind, configured from the options map
  (which also carries the input dataset under :dataset)."
  (fn [kind _] kind))

;; Supervised discretization
(defmethod make-filter :supervised-discretize
  [kind options]
  (make-filter-m kind options weka.filters.supervised.attribute.Discretize))

;; Unsupervised discretization
(defmethod make-filter :unsupervised-discretize
  [kind options]
  (make-filter-m kind options weka.filters.unsupervised.attribute.Discretize))

;; Supervised nominal to binary conversion
(defmethod make-filter :supervised-nominal-to-binary
  [kind options]
  (make-filter-m kind options weka.filters.supervised.attribute.NominalToBinary))

;; Unsupervised nominal to binary conversion
(defmethod make-filter :unsupervised-nominal-to-binary
  [kind options]
  (make-filter-m kind options weka.filters.unsupervised.attribute.NominalToBinary))
;; Processing the filtering of data
(defn filter-process
  "Runs the dataset through the filter with Filter/useFilter and returns
  the result."
  [filter dataset]
  (. Filter (useFilter dataset filter)))

73
src/clj_ml/io.clj Normal file
View file

@ -0,0 +1,73 @@
;;
;; Storing and reading data from different formats
;; @author Antonio Garrote
;;
(ns clj-ml.io
(:import (weka.core.converters CSVLoader ArffLoader XRFFLoader)
(weka.core.converters CSVSaver ArffSaver XRFFSaver)
(java.io File)
(java.net URL URI)))
;; Loading of instances
;; Dispatch only looks at the kind of source format (:arff, :xrff, :csv).
(defmulti load-instances
  "Loads instances from a data source in the given format."
  (fn [kind _] kind))
;; Expands into code that points loader at source and evaluates to the
;; dataset read by the loader. A String source is treated as a URL, a
;; java.io.File source as a file; any other source leaves the loader
;; unconfigured. Both arguments are evaluated exactly once (they used to be
;; spliced in several times).
(defmacro m-load-instances [loader source]
  `(let [loader# ~loader
         source# ~source]
     (cond
       (instance? String source#) (.setSource loader# (new URL source#))
       (instance? File source#) (.setFile loader# source#))
     (.getDataSet loader#)))
;; Loading from ARFF
(defmethod load-instances :arff
  [kind source]
  (let [arff-loader (ArffLoader.)]
    (m-load-instances arff-loader source)))

;; Loading from XRFF
(defmethod load-instances :xrff
  [kind source]
  (let [xrff-loader (XRFFLoader.)]
    (m-load-instances xrff-loader source)))

;; Loading from CSV
(defmethod load-instances :csv
  [kind source]
  (let [csv-loader (CSVLoader.)]
    (m-load-instances csv-loader source)))
;; Saving of instances
;; Dispatch only looks at the kind of output format (:arff, :xrff, :csv).
(defmulti save-instances
  "Saves instances into a destination in the given format."
  (fn [kind _ _] kind))
;; Expands into code that points saver at destiny, hands it the instances
;; and writes them in one batch. A String destiny is treated as a file URI,
;; a java.io.File destiny as a file; any other destiny leaves the saver
;; unconfigured. All arguments are evaluated exactly once (they used to be
;; spliced in several times).
(defmacro m-save-instances [saver destiny instances]
  `(let [saver# ~saver
         destiny# ~destiny
         instances# ~instances]
     (cond
       (instance? String destiny#) (.setFile saver# (new File (new URI destiny#)))
       (instance? File destiny#) (.setFile saver# destiny#))
     (.setInstances saver# instances#)
     (.writeBatch saver#)))
;; Saving as ARFF
(defmethod save-instances :arff
  [kind destiny instances]
  (let [arff-saver (ArffSaver.)]
    (m-save-instances arff-saver destiny instances)))

;; Saving as XRFF
(defmethod save-instances :xrff
  [kind destiny instances]
  (let [xrff-saver (XRFFSaver.)]
    (m-save-instances xrff-saver destiny instances)))

;; Saving as CSV
(defmethod save-instances :csv
  [kind destiny instances]
  (let [csv-saver (CSVSaver.)]
    (m-save-instances csv-saver destiny instances)))

97
src/clj_ml/utils.clj Normal file
View file

@ -0,0 +1,97 @@
;;
;; Common utilities and functions
;; @author Antonio Garrote
;;
(ns clj-ml.utils
(:import (java.io ObjectOutputStream ByteArrayOutputStream
ByteArrayInputStream ObjectInputStream
FileOutputStream FileInputStream)))
(defn key-to-str
  "Returns k unchanged when it is a string; otherwise returns the printed
  form of k without its first character (the leading colon of a keyword)."
  [k]
  (if (string? k)
    k
    (subs (str k) 1)))
;; Manipulation of array of options
(defn check-option
  "Appends flag to the options vector opts when the key val is enabled in
  map. A key that is absent, nil or false leaves opts untouched."
  [opts val flag map]
  ;; truthiness check: {:some-flag false} must not switch the flag on,
  ;; which the former nil-only check did
  (if (get map val)
    (conj opts flag)
    opts))
(defn check-option-value
  "Appends flag followed by the string form of the value stored under the
  key val in map to the options vector opts. A key that is absent or nil
  leaves opts untouched."
  [opts val flag map]
  (let [val-in-map (get map val)]
    (if (nil? val-in-map)
      opts
      (conj (conj opts flag) (str val-in-map)))))
(defn check-options
  "Runs check-option for every key/flag pair of opts-map, in the iteration
  order of opts-map, against args-map. Starts from the options vector tmp
  and returns the resulting vector."
  [opts-map args-map tmp]
  (reduce (fn [acum [k flag]]
            (check-option acum k flag args-map))
          tmp
          opts-map))
(defn check-option-values
  "Runs check-option-value for every key/flag pair of opts-map, in the
  iteration order of opts-map, against args-map. Starts from the options
  vector tmp and returns the resulting vector."
  [opts-map args-map tmp]
  (reduce (fn [acum [k flag]]
            (check-option-value acum k flag args-map))
          tmp
          opts-map))
;; Serializing classifiers
(defn serialize
  "Serializes obj with Java object serialization and returns the
  resulting byte array."
  [obj]
  (let [buffer (ByteArrayOutputStream.)]
    (doto (ObjectOutputStream. buffer)
      (.writeObject obj)
      (.close))
    (.toByteArray buffer)))
(defn deserialize
  "Reads back one object from a byte array produced by Java object
  serialization and returns it."
  [bytes]
  (let [in       (ObjectInputStream. (ByteArrayInputStream. bytes))
        restored (.readObject in)]
    (.close in)
    restored))
(defn serialize-to-file
  "Serializes obj with Java object serialization into the file at path
  and returns path."
  [obj path]
  ;; with-open closes both streams even when writeObject throws, which the
  ;; former manual .close did not
  (with-open [fs (new FileOutputStream path)
              os (new ObjectOutputStream fs)]
    (.writeObject os obj))
  path)
(defn deserialize-from-file
  "Reads back one object, written with Java object serialization, from the
  file at path and returns it."
  [path]
  ;; with-open closes both streams even when readObject throws, which the
  ;; former manual .close did not
  (with-open [fs (new FileInputStream path)
              is (new ObjectInputStream fs)]
    (.readObject is)))

View file

@ -0,0 +1,64 @@
package cljml;
import clojure.lang.ISeq;
import java.io.IOException;
import java.util.Iterator;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
/**
 * Iterator over the instances of a Weka dataset, visiting them by position
 * from the first to the last one.
 *
 * @author Antonio Garrote
 */
class ClojureInstancesIterator implements Iterator<Instance> {
    private final Instances instances;
    /** Position of the instance that the next call to next() returns. */
    private int counter;
    /** True only between a call to next() and the matching call to remove(). */
    private boolean canRemove;

    public ClojureInstancesIterator(Instances insts) {
        this.instances = insts;
        this.counter = 0;
        this.canRemove = false;
    }

    public boolean hasNext() {
        return counter < instances.numInstances();
    }

    /**
     * Returns the next instance of the dataset.
     *
     * @throws java.util.NoSuchElementException when the dataset is exhausted
     */
    public Instance next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException();
        }
        Instance next = instances.instance(counter);
        counter++;
        canRemove = true;
        return next;
    }

    /**
     * Deletes from the dataset the instance returned by the last call to next().
     *
     * @throws IllegalStateException when next() has not been called yet, or
     *         remove() was already called after the last call to next()
     */
    public void remove() {
        if (!canRemove) {
            throw new IllegalStateException("next() must be called before remove()");
        }
        // Step back first: after the deletion the following instance moves
        // into the freed position and must not be skipped.
        counter--;
        instances.delete(counter);
        canRemove = false;
    }
}
/**
 * Extension of Weka's Instances that is also Iterable, so the instances of a
 * dataset can be traversed with an Iterator. Every constructor simply
 * delegates to the constructor of Instances with the same arguments.
 */
public class ClojureInstances extends Instances implements Iterable<Instance> {

    /** Delegates to {@code Instances(String, FastVector, int)}. */
    public ClojureInstances(String name, FastVector attInfo, int capacity) {
        super(name, attInfo, capacity);
    }

    /** Delegates to {@code Instances(Reader)}. */
    public ClojureInstances(java.io.Reader input) throws IOException {
        super(input);
    }

    /** Delegates to {@code Instances(Instances)}. */
    public ClojureInstances(Instances original) {
        super(original);
    }

    /** Delegates to {@code Instances(Instances, int)}. */
    public ClojureInstances(Instances original, int capacity) {
        super(original, capacity);
    }

    /** Delegates to {@code Instances(Instances, int, int)}. */
    public ClojureInstances(Instances original, int first, int toCopy) {
        super(original, first, toCopy);
    }

    /** Returns a new iterator positioned before the first instance. */
    public Iterator<Instance> iterator() {
        return new ClojureInstancesIterator(this);
    }
}

View file

@ -0,0 +1,66 @@
(ns clj-ml.classifiers-test
(:use [clj-ml classifiers data] :reload-all)
(:use [clojure.test]))
;; Every supported C4.5 option must show up, in order, in the options array.
(deftest make-classifiers-options-c45
  (let [options  (make-classifier-options :decission-tree :c45
                                          {:unpruned true :reduced-error-pruning true
                                           :only-binary-splits true :no-raising true
                                           :no-cleanup true :laplace-smoothing true
                                           :pruning-confidence 0.12 :minimum-instances 10
                                           :pruning-number-folds 5 :shuffling-random-seed 1})
        expected ["" "-U" "-R" "-B" "-S" "-L" "-A"
                  "-C" "0.12" "-M" "10" "-N" "5" "-Q" "1"]]
    ;; one assertion per expected position
    (doseq [[idx opt] (map (fn [i o] [i o]) (iterate inc 0) expected)]
      (is (= (aget options idx)
             opt)))))
;; The C4.5 decision tree is backed by a J48 object.
(deftest make-classifier-c45
  (let [built-class (class (make-classifier :decission-tree :c45))]
    (is (= built-class
           weka.classifiers.trees.J48))))
;; Training must succeed and hand back the very classifier that was trained
;; (the former `(is true)` asserted nothing about the result).
(deftest train-classifier-c45
  (let [c (make-classifier :decission-tree :c45)
        ds (clj-ml.data/make-dataset "test" [:a :b {:c [:m :n]}] [[1 2 :m] [4 5 :m]])]
    (is (identical? c (classifier-train c ds)))))
;; Evaluating against a test dataset yields a map with 26 entries.
(deftest classifier-evaluate-dataset
  (let [c   (make-classifier :decission-tree :c45)
        ds  (clj-ml.data/make-dataset "test" [:a :b {:c [:m :n]}] [[1 2 :m] [4 5 :m]])
        tds (clj-ml.data/make-dataset "test" [:a :b {:c [:m :n]}] [[4 1 :n] [4 5 :m]])]
    ;; the classifier has to be trained before it can be evaluated
    (classifier-train c ds)
    (let [res (classifier-evaluate c :dataset ds tds)]
      (is (= 26 (count (keys res)))))))
;; Evaluating with 2-fold cross-validation yields a map with 26 entries.
(deftest classifier-evaluate-cross-validation
  (let [c  (make-classifier :decission-tree :c45)
        ds (clj-ml.data/make-dataset "test" [:a :b {:c [:m :n]}] [[1 2 :m] [4 5 :m]])]
    ;; the classifier has to be trained before it can be evaluated
    (classifier-train c ds)
    (let [res (classifier-evaluate c :cross-validation ds 2)]
      (is (= 26 (count (keys res)))))))

105
test/clj_ml/data_test.clj Normal file
View file

@ -0,0 +1,105 @@
(ns clj-ml.data-test
(:use [clj-ml.data] :reload-all)
(:use [clojure.test]))
;; An instance built from numbers stores them as doubles.
(deftest make-instance-num
  (let [inst (make-instance (make-dataset :test [:a :b] 1)
                            [1 2])]
    (is (= (class inst)
           weka.core.Instance))
    (is (= 2 (.numValues inst)))
    (is (= 1.0 (.value inst 0)))
    (is (= 2.0 (.value inst 1)))))
;; An instance may mix a numeric value with a nominal one given as keyword.
(deftest make-instance-ord
  (let [inst (make-instance (make-dataset :test [:a {:b [:b1 :b2]}] 1)
                            [1 :b1])]
    (is (= (class inst)
           weka.core.Instance))
    (is (= 2 (.numValues inst)))
    (is (= 1.0 (.value inst 0)))
    (is (= "b1" (.stringValue inst 1)))))
;; The class index of a new dataset is the index of its last attribute.
(deftest dataset-default-class
  (let [class-index (.classIndex (make-dataset :test [:a :b] 2))]
    (is (= 1 class-index))))
;; dataset-set-class replaces the default class index.
(deftest dataset-change-class
  (let [dataset (make-dataset :test [:a :b] 2)]
    (is (= 1 (.classIndex dataset)))
    (let [changed (dataset-set-class dataset 0)]
      (is (= 0 (.classIndex changed))))))
;; Adding one row to an empty dataset makes its count 1.
(deftest dataset-count-1
  (let [dataset (doto (make-dataset :test [:a :b] 2)
                  (dataset-add [1 2]))]
    (is (= 1 (dataset-count dataset)))))
;; A row added as a vector becomes the last instance of the dataset.
(deftest dataset-add-1
  (let [dataset (doto (make-dataset :test [:a :b] 2)
                  (dataset-add [1 2]))
        inst    (.lastInstance dataset)]
    (is (= 1.0 (.value inst 0)))
    (is (= 2.0 (.value inst 1)))))
;; A row added as an already built Instance becomes the last instance.
(deftest dataset-add-2
  (let [dataset  (make-dataset :test [:a :b] 2)
        instance (make-instance dataset [1 2])]
    (dataset-add dataset instance)
    (let [added (.lastInstance dataset)]
      (is (= 1.0 (.value added 0)))
      (is (= 2.0 (.value added 1))))))
;; Extracting the only instance empties the dataset and returns that instance.
(deftest dataset-extract-at-1
  (let [dataset (doto (make-dataset :test [:a :b] 2)
                  (dataset-add [1 2]))
        inst    (.lastInstance dataset)]
    (is (= 1.0 (.value inst 0)))
    (is (= 2.0 (.value inst 1)))
    (let [inst-ext (dataset-extract-at dataset 0)]
      (is (= 0 (.numInstances dataset)))
      (is (= 1.0 (.value inst-ext 0)))
      (is (= 2.0 (.value inst-ext 1))))))
;; Popping the only instance empties the dataset and returns that instance.
(deftest dataset-pop-1
  (let [dataset (doto (make-dataset :test [:a :b] 2)
                  (dataset-add [1 2]))
        inst    (.lastInstance dataset)]
    (is (= 1.0 (.value inst 0)))
    (is (= 2.0 (.value inst 1)))
    (let [inst-ext (dataset-pop dataset)]
      (is (= 0 (.numInstances dataset)))
      (is (= 1.0 (.value inst-ext 0)))
      (is (= 2.0 (.value inst-ext 1))))))
;; dataset-seq yields a sequential collection.
(deftest dataset-seq-1
  (let [instances (dataset-seq
                   (make-dataset :test [:a :b {:c [:e :f]}] [[1 2 :e] [3 4 :f]]))]
    (is (sequential? instances))))
;; Datasets can be built from maps and vectors, turned into a sequence of
;; maps and used again to build a new dataset.
(deftest working-sequences
  (let [ds  (make-dataset "test" [:a :b {:c [:d :e]}] [{:a 1 :b 2 :c :d} [4 5 :e]])
        _   (is (= 2 (dataset-count ds)))
        dsm (map (fn [inst] (instance-to-map inst)) (dataset-seq ds))]
    (is (= 2 (count dsm)))
    (is (= 1.0 (:a (first dsm))))
    (let [dsb (make-dataset "test" [:a :b {:c [:d :e]}] dsm)]
      (is (= 2 (dataset-count dsb))))))

View file

@ -0,0 +1,101 @@
(ns clj-ml.filters-test
(:use [clj-ml.filters] :reload-all)
(:use [clojure.test]))
;; Known options are emitted in order; unknown keys are ignored.
(deftest make-filter-options-supervised-discretize
  (let [options  (make-filter-options :supervised-discretize
                                      {:attributes [1 2] :invert true :binary true
                                       :better-encoding true :kononenko true
                                       :nonexitent true})
        expected ["-R" "2,3" "-V" "-D" "-E" "-K"]]
    ;; one assertion per expected position
    (doseq [[idx opt] (map (fn [i o] [i o]) (iterate inc 0) expected)]
      (is (= (aget options idx)
             opt)))))
;; Flags and valued options of the unsupervised discretize filter are emitted
;; in order. The option map used to list :better-encoding twice; duplicate
;; keys in a map literal are rejected by newer Clojure readers, so the key
;; now appears only once.
(deftest make-filter-options-unsupervised-discretize
  (let [options (make-filter-options :unsupervised-discretize {:attributes [1 2] :binary true :better-encoding true
                                                               :equal-frequency true :optimize true
                                                               :number-bins 4 :weight-bins 1})]
    (is (= (aget options 0)
           "-R"))
    (is (= (aget options 1)
           "2,3"))
    (is (= (aget options 2)
           "-D"))
    (is (= (aget options 3)
           "-E"))
    (is (= (aget options 4)
           "-F"))
    (is (= (aget options 5)
           "-O"))
    (is (= (aget options 6)
           "-B"))
    (is (= (aget options 7)
           "4"))
    (is (= (aget options 8)
           "-M"))
    (is (= (aget options 9)
           "1"))))
;; The supervised nominal-to-binary options start with an empty string.
(deftest make-filter-options-supervised-nominal-to-binary
  (let [options  (make-filter-options :supervised-nominal-to-binary
                                      {:also-binary true :for-each-nominal true})
        expected ["" "-N" "-A"]]
    ;; one assertion per expected position
    (doseq [[idx opt] (map (fn [i o] [i o]) (iterate inc 0) expected)]
      (is (= (aget options idx)
             opt)))))
;; The attribute range comes first, followed by the flags in a fixed order.
(deftest make-filter-options-unsupervised-nominal-to-binary
  (let [options  (make-filter-options :unsupervised-nominal-to-binary
                                      {:attributes [1,2] :also-binary true
                                       :for-each-nominal true :invert true})
        expected ["-R" "2,3" "-V" "-N" "-A"]]
    ;; one assertion per expected position
    (doseq [[idx opt] (map (fn [i o] [i o]) (iterate inc 0) expected)]
      (is (= (aget options idx)
             opt)))))
;; :supervised-discretize builds the supervised Discretize filter.
(deftest make-filter-discretize-sup
  (let [ds     (clj-ml.data/make-dataset :test [:a :b {:c [:g :m]}]
                                         [[1 2 :g]
                                          [2 3 :m]
                                          [4 5 :g]])
        built  (make-filter :supervised-discretize {:dataset ds :attributes [0]})]
    (is (= weka.filters.supervised.attribute.Discretize
           (class built)))))
;; :unsupervised-discretize builds the unsupervised Discretize filter.
(deftest make-filter-discretize-unsup
  (let [ds     (clj-ml.data/make-dataset :test [:a :b {:c [:g :m]}]
                                         [[1 2 :g]
                                          [2 3 :m]
                                          [4 5 :g]])
        built  (make-filter :unsupervised-discretize {:dataset ds :attributes [0]})]
    (is (= weka.filters.unsupervised.attribute.Discretize
           (class built)))))
;; :supervised-nominal-to-binary builds the supervised NominalToBinary filter.
(deftest make-filter-nominal-to-binary-sup
  (let [ds     (clj-ml.data/make-dataset :test [:a :b {:c [:g :m]}]
                                         [[1 2 :g]
                                          [2 3 :m]
                                          [4 5 :g]])
        built  (make-filter :supervised-nominal-to-binary {:dataset ds})]
    (is (= weka.filters.supervised.attribute.NominalToBinary
           (class built)))))
;; :unsupervised-nominal-to-binary builds the unsupervised NominalToBinary filter.
(deftest make-filter-nominal-to-binary-unsup
  (let [ds     (clj-ml.data/make-dataset :test [:a :b {:c [:g :m]}]
                                         [[1 2 :g]
                                          [2 3 :m]
                                          [4 5 :g]])
        built  (make-filter :unsupervised-nominal-to-binary {:dataset ds :attributes [2]})]
    (is (= weka.filters.unsupervised.attribute.NominalToBinary
           (class built)))))