adds split-dataset via new remove-percentage filter wrapper

This commit is contained in:
Ben Mabey 2011-12-07 10:40:41 -07:00
parent b1130cf80d
commit 0c32c318af
5 changed files with 66 additions and 2 deletions

View file

@ -12,6 +12,7 @@
that can be transformed using usual Clojure functions like map, reduce, etc."
(:use [clj-ml utils]
[clojure.contrib.seq :only [find-first]])
(:require [clj-ml.filters :as filters])
(:import (weka.core Instance Instances FastVector Attribute)
(cljml ClojureInstances)))
@ -431,3 +432,18 @@ The intention is for this to be used on data-formats and not on datasets with da
(doto dataset
(.deleteAttributeAt (int attr-pos))
(.insertAttributeAt new-attr (int attr-pos)))))
(defn split-dataset
  "Splits ds into two complementary parts according to the given percentage.
  Returns a two-element vector of Delay objects: dereferencing the first yields a dataset with
  'percentage' percent of the original dataset, dereferencing the second yields the remaining
  portion. Nothing is computed until a Delay is dereffed; use do-split-dataset if you want
  the datasets themselves rather than Delays."
  [ds percentage]
  (let [rest-opts {:percentage percentage}
        part-opts (assoc rest-opts :invert true)]
    ;; With :invert the filter keeps the portion it would otherwise remove, so the two
    ;; filter runs produce complementary parts of ds.
    [(delay (filters/remove-percentage ds part-opts))
     (delay (filters/remove-percentage ds rest-opts))]))
(defn do-split-dataset
  "Splits the dataset into two parts based on the percentage given. The same as split-dataset but
  the actual datasets are returned instead of Delay objects that need dereffing. Both parts are
  computed before this function returns."
  [ds percentage]
  ;; map is lazy: without doall the delays would only be dereferenced when the caller first
  ;; walks the result, so the split would not actually happen here. Force it eagerly.
  (doall (map deref (split-dataset ds percentage))))

View file

@ -36,7 +36,7 @@
The previous sample of code could be rewritten with the make-apply-filter function:
(def filtered-ds (make-apply-filter :remove-attributes {:attributes [:a :c]} ds))"
(:use [clj-ml data utils options-utils]
(:use [clj-ml utils options-utils]
[clojure.contrib [def :only [defvar defvar-]]])
(:require [clojure.contrib [string :as str]])
(:import (weka.filters Filter)
@ -135,6 +135,13 @@
(deffilter remove-attributes)
(defmethod make-filter-options :remove-percentage
  ;; Builds the options for the remove-percentage filter: :percentage is mapped to the
  ;; "-P" flag and :invert to the "-V" flag.
  [_kind opts]
  (let [with-percentage (check-option-values opts {:percentage "-P"})]
    (check-options opts {:invert "-V"} with-percentage)))
;; Defines the remove-percentage filter function. It is called as
;; (remove-percentage dataset options-map), e.g. with {:percentage 75}.
(deffilter remove-percentage)
(defmethod make-filter-options :remove-useless-attributes
  ;; Maps the :max-variance entry of the options map to the "-M" flag.
  [_kind opts]
  (check-option-values opts {:max-variance "-M"}))
@ -170,6 +177,7 @@
:numeric-to-nominal weka.filters.unsupervised.attribute.NumericToNominal
:add-attribute weka.filters.unsupervised.attribute.Add
:remove-attributes weka.filters.unsupervised.attribute.Remove
:remove-percentage weka.filters.unsupervised.instance.RemovePercentage
:remove-useless-attributes weka.filters.unsupervised.attribute.RemoveUseless
:select-append-attributes weka.filters.unsupervised.attribute.Copy
:project-attributes weka.filters.unsupervised.attribute.Remove}
@ -188,6 +196,7 @@
- :numeric-to-nominal
- :add-attribute
- :remove-attributes
- :remove-percentage
- :remove-useless-attributes
- :select-append-attributes
- :project-attributes

View file

@ -6,7 +6,7 @@
(ns #^{:author "Ben Mabey <ben@benmabey.com>"
:skip-wiki true}
clj-ml.options-utils
(:use [clj-ml data])
(:use [clojure.contrib.seq :only [find-first]])
(:require [clojure.contrib [string :as str]]))
;; Manipulation of array of options
@ -26,6 +26,18 @@
(conj (conj opts flag) (str val-in-map)))))
;; attr-name and dataset-index-attr are copied and pasted from clj-ml.data because Clojure
;; cannot handle circular dependencies between namespaces. :(
(defn- attr-name
  "Returns the name of the given weka attribute."
  [^weka.core.Attribute attr]
  (. attr (name)))
(defn- dataset-index-attr
  "Returns the index of an attribute in the attributes definition of a dataset.
  A numeric attr is treated as an index already and returned unchanged; otherwise the
  first index whose attribute name equals (name attr) is returned, or nil if none matches."
  [^weka.core.Instances dataset attr]
  (if-not (number? attr)
    (let [named-like-attr? (fn [idx]
                             (= (name attr)
                                (attr-name (.attribute dataset (int idx)))))
          candidate-indexes (range (.numAttributes dataset))]
      (find-first named-like-attr? candidate-indexes))
    attr))
(defn extract-attributes
"Transforms the :attributes value from m into the appropriate weka flag"
([m] (extract-attributes "-R" m))

View file

@ -180,3 +180,23 @@
(is (= nil (dataset-class-name ds)))
(dataset-set-class ds :b)
(is (= :b (dataset-class-name ds)))))
(deftest split-dataset-test
  (let [rows [[1 :foo] [2 :bar] [3 :bar] [4 :foo]]
        ds (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        [quarter remainder] (split-dataset ds 25)]
    ;; Both parts are Delays and must be dereffed. 25% of four instances is one
    ;; instance, and the remaining three belong to the second part.
    (is (= 1 (dataset-count @quarter)))
    (is (= 3 (dataset-count @remainder)))))
(deftest do-split-dataset-test
  (let [rows [[1 :foo] [2 :bar] [3 :bar] [4 :foo]]
        ds (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        [quarter remainder] (do-split-dataset ds 25)]
    ;; Same split as split-dataset, but the parts are datasets, not Delays.
    (is (= 1 (dataset-count quarter)))
    (is (= 3 (dataset-count remainder)))))

View file

@ -107,6 +107,13 @@
(is (= (dataset-format res)
[:b {:c '(:g :m)}]))))
;; NOTE: the test name is misspelt ("precentage"); it is kept as-is so the var name does not change.
(deftest remove-precentage-test
  (let [rows [[1 2 :g] [2 3 :m] [4 2 :m] [4 5 :g]]
        ds (make-dataset :test [:a :b {:c [:g :m]}] rows)]
    ;; Removing 75% of four instances leaves a single instance.
    (is (= 1 (-> ds
                 (remove-percentage {:percentage 75})
                 dataset-count)))))
(deftest make-apply-filter-numeric-to-nominal
(let [ds (make-dataset :test [:a :b {:c [:g :m]}]