diff --git a/src/clj_ml/data.clj b/src/clj_ml/data.clj index c76c0a4..5bb2d20 100644 --- a/src/clj_ml/data.clj +++ b/src/clj_ml/data.clj @@ -12,6 +12,7 @@ that can be transformed using usual Clojure functions like map, reduce, etc." (:use [clj-ml utils] [clojure.contrib.seq :only [find-first]]) + (:require [clj-ml.filters :as filters]) (:import (weka.core Instance Instances FastVector Attribute) (cljml ClojureInstances))) @@ -431,3 +432,18 @@ The intention is for this to be used on data-formats and not on datasets with da (doto dataset (.deleteAttributeAt (int attr-pos)) (.insertAttributeAt new-attr (int attr-pos))))) + +(defn split-dataset + "Splits the dataset into two parts based on the percentage given. +The first dataset returned will have 'percentage' amount of the original dataset and the second has the +remaining portion. Both datasets are Delay objects that need to be dereffed. If you want to have the +split immediately you can use do-split-dataset." + [ds percentage] + [(delay (filters/remove-percentage ds {:percentage percentage :invert true})) + (delay (filters/remove-percentage ds {:percentage percentage}))]) + +(defn do-split-dataset + "Splits the dataset into two parts based on the percentage given. The same as split-dataset but +both splits are forced eagerly (doall) and the actual datasets are returned, not Delay objects." 
+ [ds percentage] + (doall (map deref (split-dataset ds percentage)))) diff --git a/src/clj_ml/filters.clj b/src/clj_ml/filters.clj index 21371b8..70eea9e 100644 --- a/src/clj_ml/filters.clj +++ b/src/clj_ml/filters.clj @@ -36,7 +36,7 @@ The previous sample of code could be rewritten with the make-apply-filter function: (def filtered-ds (make-apply-filter :remove-attributes {:attributes [:a :c]} ds))" - (:use [clj-ml data utils options-utils] + (:use [clj-ml utils options-utils] [clojure.contrib [def :only [defvar defvar-]]]) (:require [clojure.contrib [string :as str]]) (:import (weka.filters Filter) @@ -135,6 +135,13 @@ (deffilter remove-attributes) +(defmethod make-filter-options :remove-percentage + ([kind m] + (->> (check-option-values m {:percentage "-P"}) + (check-options m {:invert "-V"})))) + +(deffilter remove-percentage) + (defmethod make-filter-options :remove-useless-attributes ([kind m] (check-option-values m {:max-variance "-M"}))) @@ -170,6 +177,7 @@ :numeric-to-nominal weka.filters.unsupervised.attribute.NumericToNominal :add-attribute weka.filters.unsupervised.attribute.Add :remove-attributes weka.filters.unsupervised.attribute.Remove + :remove-percentage weka.filters.unsupervised.instance.RemovePercentage :remove-useless-attributes weka.filters.unsupervised.attribute.RemoveUseless :select-append-attributes weka.filters.unsupervised.attribute.Copy :project-attributes weka.filters.unsupervised.attribute.Remove} @@ -188,6 +196,7 @@ - :numeric-to-nominal - :add-attribute - :remove-attributes + - :remove-percentage - :remove-useless-attributes - :select-append-attributes - :project-attributes diff --git a/src/clj_ml/options_utils.clj b/src/clj_ml/options_utils.clj index 3ff43de..bb1e405 100644 --- a/src/clj_ml/options_utils.clj +++ b/src/clj_ml/options_utils.clj @@ -6,7 +6,7 @@ (ns #^{:author "Ben Mabey " :skip-wiki true} clj-ml.options-utils - (:use [clj-ml data]) + (:use [clojure.contrib.seq :only [find-first]]) (:require [clojure.contrib [string :as 
str]])) ;; Manipulation of array of options @@ -26,6 +26,18 @@ (conj (conj opts flag) (str val-in-map))))) +;; attr-name and dataset-index-attr are copied from clj-ml.data because Clojure cannot +;; handle circular namespace dependencies. :( +(defn- attr-name [^weka.core.Attribute attr] + (.name attr)) + +(defn- dataset-index-attr + "Returns the index of an attribute in the attributes definition of a dataset." + [^weka.core.Instances dataset attr] + (if (number? attr) + attr + (find-first #(= (name attr) (attr-name (.attribute dataset (int %)))) (range (.numAttributes dataset))))) + (defn extract-attributes "Transforms the :attributes value from m into the appropriate weka flag" ([m] (extract-attributes "-R" m)) diff --git a/test/clj_ml/data_test.clj b/test/clj_ml/data_test.clj index 00007bd..d3d4d70 100644 --- a/test/clj_ml/data_test.clj +++ b/test/clj_ml/data_test.clj @@ -180,3 +180,23 @@ (is (= nil (dataset-class-name ds))) (dataset-set-class ds :b) (is (= :b (dataset-class-name ds))))) + +(deftest split-dataset-test + (let [ds (make-dataset "test" [:a {:b [:foo :bar]}] + [[1 :foo] + [2 :bar] + [3 :bar] + [4 :foo]]) + [a b] (split-dataset ds 25)] + (is (= (dataset-count @a) 1)) + (is (= (dataset-count @b) 3)))) + +(deftest do-split-dataset-test + (let [ds (make-dataset "test" [:a {:b [:foo :bar]}] + [[1 :foo] + [2 :bar] + [3 :bar] + [4 :foo]]) + [a b] (do-split-dataset ds 25)] + (is (= (dataset-count a) 1)) + (is (= (dataset-count b) 3)))) diff --git a/test/clj_ml/filters_test.clj b/test/clj_ml/filters_test.clj index f38ae42..09c4bb4 100644 --- a/test/clj_ml/filters_test.clj +++ b/test/clj_ml/filters_test.clj @@ -107,6 +107,13 @@ (is (= (dataset-format res) [:b {:c '(:g :m)}])))) +(deftest remove-precentage-test + (let [ds (make-dataset :test [:a :b {:c [:g :m]}] + [ [1 2 :g] + [2 3 :m] + [4 2 :m] + [4 5 :g]])] + (is (= (dataset-count (remove-percentage ds {:percentage 75})) 1)))) (deftest make-apply-filter-numeric-to-nominal (let [ds (make-dataset :test [:a :b {:c 
[:g :m]}]