diff --git a/README.md b/README.md index de2b3c7..f216a9d 100644 --- a/README.md +++ b/README.md @@ -96,10 +96,15 @@ API documenation can be found [here](http://antoniogarrote.github.com/clj-ml/ind REPL>(def ds (load-instances :arff "file:///Applications/weka-3-6-2/data/iris.arff")) REPL>; Discretizing a numeric attribute using an unsupervised filter - REPL>(def discretize (make-filter :unsupervised-discretize {:dataset-format ds :attributes [0 2]})) + REPL>(def discretize (make-filter :unsupervised-discretize {:dataset-format ds :attributes [:sepallength :petallength]})) REPL>(def filtered-ds (filter-apply discretize ds)) + + REPL>; You can also use the filter's fn directly, which will create and apply the filter: + REPL>(def filtered-ds (unsupervised-discretize ds {:attributes [:sepallength :petallength]})) + REPL>; The above way lends itself to the -> macro and is useful when using multiple filters. + REPL>; The equivalent operation can be done with the ->> macro and make-apply-filter fn: REPL>(def filtered-ds (->> "file:///Applications/weka-3-6-2/data/iris.arff") diff --git a/src/clj_ml/filters.clj b/src/clj_ml/filters.clj index c6dcba4..d8a1d4d 100644 --- a/src/clj_ml/filters.clj +++ b/src/clj_ml/filters.clj @@ -9,21 +9,33 @@ dataset in some way: transforming nominal attributes into binary attributes, removing attributes etc. - A sample use of the API is shown below: + There are a number of ways to use the filtering API.
The most straightforward and + idiomatic Clojure way is to use the provided filter fns: - ;; *ds* is the dataset where the first attribute is to be removed - (def *filter* (make-filter :remove-attributes {:dataset-format *ds* :attributes [:name-of-attr]})) + ;; ds is the dataset + (def ds (make-dataset :test [:a :b {:c [:g :m]}] + [ [1 2 :g] + [2 3 :m] + [4 5 :g]])) + (def filtered-ds + (-> ds + (add-attribute {:type :nominal, :column 1, :name \"pet\", :labels [\"dog\" \"cat\"]}) + (remove-attributes {:attributes [:a :c]}))) + + + The above functions rely on lower-level fns that create and apply the filters, which you may + also use if you need more control over the actual filter objects: + + (def filter (make-filter :remove-attributes {:dataset-format ds :attributes [:a :c]})) ;; We apply the filter to the original data set and obtain the new one - (def *filtered-ds* (filter-apply *filter* *ds*)) + (def filtered-ds (filter-apply filter ds)) The previous sample of code could be rewritten with the make-apply-filter function: - ;; There is no necessity of passing the :dataset-format option, *ds* format is used - ;; automatically - (def *filtered-ds* (make-apply-filter :remove-attributes {:attributes [0]} *ds*))" + (def filtered-ds (make-apply-filter :remove-attributes {:attributes [:a :c]} ds))" (:use [clj-ml data utils] [clojure.contrib [def :only [defvar defvar-]]]) (:require [clojure.contrib [string :as str]]) @@ -45,6 +57,20 @@ (for [attr (:attributes m)] (inc (index-attr (:dataset-format m) attr))))]) +(declare make-apply-filter) +;TODO: consider passing in the make-filter-options body here as well, in addition to the docstring. +(defmacro deffilter + "Defines a fn, named after the filter, that makes and applies the filter to a dataset."
+ [filter-name] + (let [filter-keyword (keyword filter-name)] + `(do + (defn ~filter-name + ([ds#] + (make-apply-filter ~filter-keyword {} ds#)) + ([ds# attributes#] + (make-apply-filter ~filter-keyword attributes# ds#)))))) + + (defmethod make-filter-options :supervised-discretize ([kind m] (->> (extract-attributes m) @@ -53,6 +79,8 @@ :better-encoding "-E" :kononenko "-K"})))) +(deffilter supervised-discretize) + (defmethod make-filter-options :unsupervised-discretize ([kind m] (->> (extract-attributes m) @@ -64,10 +92,15 @@ (check-option-values m {:number-bins "-B" :weight-bins "-M"})))) +(deffilter unsupervised-discretize) + (defmethod make-filter-options :supervised-nominal-to-binary ([kind m] (check-options m {:also-binary "-N" :for-each-nominal "-A"}))) + +(deffilter supervised-nominal-to-binary) + (defmethod make-filter-options :unsupervised-nominal-to-binary ([kind m] (->> (extract-attributes m) @@ -75,14 +108,14 @@ :also-binary "-N" :for-each-nominal "-A"})))) +(deffilter unsupervised-nominal-to-binary) + (defmethod make-filter-options :numeric-to-nominal ([kind m] (->> (extract-attributes m) (check-options m {:invert "-V"})))) -(defmethod make-filter-options :add-attribute - ([kind m] - (->> (extract-attributes m) - (check-options m {:invert "-V"})))) +(deffilter numeric-to-nominal) + (defvar- attribute-types {:numeric "NUM" :nominal "NOM" :string "STR" :date "DAT"} "Mapping of Weka's attribute types from clj-ml keywords to the -T flag's representation.") @@ -99,20 +132,28 @@ :column "-C" :date-format "-F"})))) +(deffilter add-attribute) + (defmethod make-filter-options :remove-attributes ([kind m] (->> (extract-attributes m) (check-options m {:invert "-V"})))) +(deffilter remove-attributes) + (defmethod make-filter-options :remove-useless-attributes ([kind m] (check-option-values m {:max-variance "-M"}))) +(deffilter remove-useless-attributes) + (defmethod make-filter-options :select-append-attributes ([kind m] (->> (extract-attributes m) (check-options m
{:invert "-V"})))) +(deffilter select-append-attributes) + (defmethod make-filter-options :project-attributes ([kind options] (let [opts (if (nil? (:invert options)) @@ -120,6 +161,10 @@ (dissoc options :invert))] (make-filter-options :remove-attributes opts)))) +(deffilter project-attributes) + +(deffilter clj-streamable) +(deffilter clj-batch) ;; Creation of filters diff --git a/test/clj_ml/filters_test.clj b/test/clj_ml/filters_test.clj index 561b7c4..eb2c86e 100644 --- a/test/clj_ml/filters_test.clj +++ b/test/clj_ml/filters_test.clj @@ -130,7 +130,7 @@ [ [1 2 :g] [2 3 :m] [4 5 :g]]) - res (make-apply-filter :add-attribute {:type :nominal, :column 1, :name "pet", :labels ["dog" "cat"]} ds)] + res (add-attribute ds {:type :nominal, :column 1, :name "pet", :labels ["dog" "cat"]})] (is (= (dataset-format res) [:a {:pet '(:cat :dog)} :b {:c '(:m :g)}])))) @@ -145,6 +145,17 @@ (is (= (dataset-format res) [{:pet '(:cat :dog)} :b])))) +(deftest using-regular-filter-fns-with-threading + (let [ds (make-dataset :test [:a :b {:c [:g :m]}] + [ [1 2 :g] + [2 3 :m] + [4 5 :g]]) + res (-> ds + (add-attribute {:type :nominal, :column 1, :name "pet", :labels ["dog" "cat"]}) + (remove-attributes {:attributes [:a :c]}))] + (is (= (dataset-format res) + [{:pet '(:cat :dog)} :b])))) + (deftest make-apply-filter-clj-streamable (let [ds (make-dataset :test [:a :b {:c [:g :m]}] [ [1 2 :g] @@ -189,9 +200,9 @@ (#(weka.core.Instance. 1 (into-array Double/TYPE %))) add-instance)) result)) - res (make-apply-filter :clj-batch - {:process add-max-diff-values - :determine-dataset-format add-max-diff-attr} ds)] + res (clj-batch ds + {:process add-max-diff-values + :determine-dataset-format add-max-diff-attr})] (is (= [{:a 1 :max-diff 3} {:a 2 :max-diff 2} {:a 4 :max-diff 0}]