diff --git a/README.md b/README.md index c891b56..4ab05dc 100644 --- a/README.md +++ b/README.md @@ -307,12 +307,14 @@ user> (classifier-classify classifier2 instance) Text document handling: ```clojure -user> (def docs [{:title "Document title 1" +user> (def docs [{:id 10 + :title "Document title 1" :fulltext "This is the fulltext..." - :has-term? false} - {:title "Another document title" + :has-class? false} + {:id 11 + :title "Another document title" :fulltext "Some more \"fulltext\"; rabbit artificial machine bananas" - :has-term? true}]) + :has-class? true}]) #'user/docs user> (docs-to-dataset docs "bananas-model" "my-models" :stemmer true :lowercase false) diff --git a/src/clj_ml/data.clj b/src/clj_ml/data.clj index 6deb6ef..f701e66 100644 --- a/src/clj_ml/data.clj +++ b/src/clj_ml/data.clj @@ -535,7 +535,7 @@ split immediately you can use do-split-dataset." [docs model-prefix model-dir & opts] (let [parsed-opts (apply hash-map opts) original-ordering (map :id docs) - docs-with-class (filter :has-class? doc) + docs-with-class (filter :has-class? docs) docs-without-class (let [dwoc (filter #(not (:has-class? %)) docs)] (if (:resample parsed-opts) (take (count docs-with-class) dwoc) @@ -544,7 +544,7 @@ split immediately you can use do-split-dataset." (concat (take (/ (:keep-n parsed-opts) 2) docs-with-class) (take (/ (:keep-n parsed-opts) 2) docs-without-class)) (concat docs-with-class docs-without-class)) - docs-shuffled (my-shuffle (sort-by :id docs-keep-n)) + docs-shuffled (shuffle (sort-by :id docs-keep-n)) ds (make-dataset :docs [{:class [:no :yes]} {:title nil} {:fulltext nil}] (for [doc docs-shuffled] diff --git a/test/clj_ml/data_test.clj b/test/clj_ml/data_test.clj index b7beb17..d4ce413 100644 --- a/test/clj_ml/data_test.clj +++ b/test/clj_ml/data_test.clj @@ -229,3 +229,20 @@ [3 :bar] [4 :foo]])] (is (= (dataset-as-vecs (take-dataset ds 2)) [[1.0 :foo] [2.0 :bar]])))) + +(deftest docs-to-dataset-test + (let [docs [{:id 10 + :title "Document title 1" + :fulltext "This is the fulltext..." + :has-class? false} + {:id 11 + :title "Another document title" + :fulltext "Some more \"fulltext\"; rabbit artificial machine bananas" + :has-class? true}] + {:keys [dataset docids]} (docs-to-dataset docs "bananas-model" "." + :stemmer true :lowercase false) + docid-ds-vecs (apply hash-map (interleave docids (dataset-as-vecs dataset)))] + (is (= [:no 0.4804530139182014 0.0 0.4804530139182014 0.0] + (get docid-ds-vecs 10))) + (is (= [:yes 0.0 0.4804530139182014 0.0 0.4804530139182014 0.0 0.4804530139182014 0.0 0.4804530139182014] + (get docid-ds-vecs 11)))))