adds split-dataset via new remove-percentage filter wrapper

This commit is contained in:
Ben Mabey 2011-12-07 10:40:41 -07:00
parent b1130cf80d
commit 0c32c318af
5 changed files with 66 additions and 2 deletions

View file

@ -12,6 +12,7 @@
that can be transformed using usual Clojure functions like map, reduce, etc." that can be transformed using usual Clojure functions like map, reduce, etc."
(:use [clj-ml utils] (:use [clj-ml utils]
[clojure.contrib.seq :only [find-first]]) [clojure.contrib.seq :only [find-first]])
(:require [clj-ml.filters :as filters])
(:import (weka.core Instance Instances FastVector Attribute) (:import (weka.core Instance Instances FastVector Attribute)
(cljml ClojureInstances))) (cljml ClojureInstances)))
@ -431,3 +432,18 @@ The intention is for this to be used on data-formats and not on datasets with da
(doto dataset (doto dataset
(.deleteAttributeAt (int attr-pos)) (.deleteAttributeAt (int attr-pos))
(.insertAttributeAt new-attr (int attr-pos))))) (.insertAttributeAt new-attr (int attr-pos)))))
(defn split-dataset
  "Partitions ds in two according to percentage, without doing the work up front.
  Returns a vector of two Delay objects. Dereferencing the first yields a dataset
  holding `percentage` percent of the original dataset; dereferencing the second
  yields the remaining portion. Each underlying filter only runs when its Delay is
  dereffed. Use do-split-dataset if you want the datasets themselves rather than
  Delays."
  [ds percentage]
  (let [selected  (delay (filters/remove-percentage ds {:percentage percentage :invert true}))
        remainder (delay (filters/remove-percentage ds {:percentage percentage}))]
    [selected remainder]))
(defn do-split-dataset
  "Splits the dataset into two parts based on the percentage given. The same as
  split-dataset, but the Delays are dereferenced before returning, so the result is a
  fully realized sequence of two datasets: the first holds `percentage` percent of ds
  and the second holds the remaining portion."
  [ds percentage]
  ;; map is lazy: without doall the derefs (and therefore the filters) would only run
  ;; when the caller first consumes the result, which defeats the purpose of this
  ;; function and could observe later mutations of ds.
  (doall (map deref (split-dataset ds percentage))))

View file

@ -36,7 +36,7 @@
The previous sample of code could be rewritten with the make-apply-filter function: The previous sample of code could be rewritten with the make-apply-filter function:
(def filtered-ds (make-apply-filter :remove-attributes {:attributes [:a :c]} ds))" (def filtered-ds (make-apply-filter :remove-attributes {:attributes [:a :c]} ds))"
(:use [clj-ml data utils options-utils] (:use [clj-ml utils options-utils]
[clojure.contrib [def :only [defvar defvar-]]]) [clojure.contrib [def :only [defvar defvar-]]])
(:require [clojure.contrib [string :as str]]) (:require [clojure.contrib [string :as str]])
(:import (weka.filters Filter) (:import (weka.filters Filter)
@ -135,6 +135,13 @@
(deffilter remove-attributes) (deffilter remove-attributes)
;; Builds the filter options for :remove-percentage from the options map m:
;; the :percentage value is associated with the "-P" flag and the :invert
;; option with the "-V" flag.
(defmethod make-filter-options :remove-percentage
  [kind m]
  (let [percentage-opts (check-option-values m {:percentage "-P"})]
    (check-options m {:invert "-V"} percentage-opts)))

;; NOTE(review): deffilter is defined elsewhere in this namespace; presumably it
;; generates the remove-percentage function that callers invoke as
;; (remove-percentage dataset options) -- confirm against the macro definition.
(deffilter remove-percentage)
(defmethod make-filter-options :remove-useless-attributes (defmethod make-filter-options :remove-useless-attributes
([kind m] ([kind m]
(check-option-values m {:max-variance "-M"}))) (check-option-values m {:max-variance "-M"})))
@ -170,6 +177,7 @@
:numeric-to-nominal weka.filters.unsupervised.attribute.NumericToNominal :numeric-to-nominal weka.filters.unsupervised.attribute.NumericToNominal
:add-attribute weka.filters.unsupervised.attribute.Add :add-attribute weka.filters.unsupervised.attribute.Add
:remove-attributes weka.filters.unsupervised.attribute.Remove :remove-attributes weka.filters.unsupervised.attribute.Remove
:remove-percentage weka.filters.unsupervised.instance.RemovePercentage
:remove-useless-attributes weka.filters.unsupervised.attribute.RemoveUseless :remove-useless-attributes weka.filters.unsupervised.attribute.RemoveUseless
:select-append-attributes weka.filters.unsupervised.attribute.Copy :select-append-attributes weka.filters.unsupervised.attribute.Copy
:project-attributes weka.filters.unsupervised.attribute.Remove} :project-attributes weka.filters.unsupervised.attribute.Remove}
@ -188,6 +196,7 @@
- :numeric-to-nominal - :numeric-to-nominal
- :add-attribute - :add-attribute
- :remove-attributes - :remove-attributes
- :remove-percentage
- :remove-useless-attributes - :remove-useless-attributes
- :select-append-attributes - :select-append-attributes
- :project-attributes - :project-attributes

View file

@ -6,7 +6,7 @@
(ns #^{:author "Ben Mabey <ben@benmabey.com>" (ns #^{:author "Ben Mabey <ben@benmabey.com>"
:skip-wiki true} :skip-wiki true}
clj-ml.options-utils clj-ml.options-utils
(:use [clj-ml data]) (:use [clojure.contrib.seq :only [find-first]])
(:require [clojure.contrib [string :as str]])) (:require [clojure.contrib [string :as str]]))
;; Manipulation of array of options ;; Manipulation of array of options
@ -26,6 +26,18 @@
(conj (conj opts flag) (str val-in-map))))) (conj (conj opts flag) (str val-in-map)))))
;; attr-name and dataset-index-attr are copied and pasted from clj-ml.data because
;; Clojure cannot handle circular namespace dependencies. :(
(defn- attr-name
  "Returns the name of the given weka Attribute."
  [^weka.core.Attribute attr]
  (.name attr))
(defn- dataset-index-attr
  "Returns the index of an attribute in the attributes definition of a dataset.
  A numeric attr is taken to already be an index and is returned unchanged. Otherwise
  the result is the first index whose attribute name equals (name attr), or nil when
  no attribute matches."
  [^weka.core.Instances dataset attr]
  (letfn [(same-name? [idx]
            ;; compare the requested name with the name of the attribute at idx
            (= (name attr) (attr-name (.attribute dataset (int idx)))))]
    (if-not (number? attr)
      (find-first same-name? (range (.numAttributes dataset)))
      attr)))
(defn extract-attributes (defn extract-attributes
"Transforms the :attributes value from m into the appropriate weka flag" "Transforms the :attributes value from m into the appropriate weka flag"
([m] (extract-attributes "-R" m)) ([m] (extract-attributes "-R" m))

View file

@ -180,3 +180,23 @@
(is (= nil (dataset-class-name ds))) (is (= nil (dataset-class-name ds)))
(dataset-set-class ds :b) (dataset-set-class ds :b)
(is (= :b (dataset-class-name ds))))) (is (= :b (dataset-class-name ds)))))
;; A 25% split of a four-instance dataset should give one instance in the first
;; (delayed) dataset and three in the second.
(deftest split-dataset-test
  (let [rows [[1 :foo]
              [2 :bar]
              [3 :bar]
              [4 :foo]]
        ds (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        [selected remainder] (split-dataset ds 25)]
    (is (= (dataset-count (deref selected)) 1))
    (is (= (dataset-count (deref remainder)) 3))))
;; Same scenario as split-dataset-test, but do-split-dataset hands back the datasets
;; themselves, so no dereferencing is needed.
(deftest do-split-dataset-test
  (let [rows [[1 :foo]
              [2 :bar]
              [3 :bar]
              [4 :foo]]
        ds (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        [selected remainder] (do-split-dataset ds 25)]
    (is (= (dataset-count selected) 1))
    (is (= (dataset-count remainder) 3))))

View file

@ -107,6 +107,13 @@
(is (= (dataset-format res) (is (= (dataset-format res)
[:b {:c '(:g :m)}])))) [:b {:c '(:g :m)}]))))
;; Removing 75% of a four-instance dataset should leave exactly one instance.
(deftest remove-precentage-test
  (let [rows [[1 2 :g]
              [2 3 :m]
              [4 2 :m]
              [4 5 :g]]
        ds (make-dataset :test [:a :b {:c [:g :m]}] rows)
        filtered (remove-percentage ds {:percentage 75})]
    (is (= (dataset-count filtered) 1))))
(deftest make-apply-filter-numeric-to-nominal (deftest make-apply-filter-numeric-to-nominal
(let [ds (make-dataset :test [:a :b {:c [:g :m]}] (let [ds (make-dataset :test [:a :b {:c [:g :m]}]