Fixed some bugs in new docs-to-dataset, added test.

This commit is contained in:
Joshua Eckroth 2013-12-24 08:21:17 -05:00
parent 05b59b0cfe
commit 19ce704093
3 changed files with 25 additions and 6 deletions

View file

@ -307,12 +307,14 @@ user> (classifier-classify classifier2 instance)
Text document handling: Text document handling:
```clojure ```clojure
user> (def docs [{:title "Document title 1" user> (def docs [{:id 10
:title "Document title 1"
:fulltext "This is the fulltext..." :fulltext "This is the fulltext..."
:has-term? false} :has-class? false}
{:title "Another document title" {:id 11
:title "Another document title"
:fulltext "Some more \"fulltext\"; rabbit artificial machine bananas" :fulltext "Some more \"fulltext\"; rabbit artificial machine bananas"
:has-term? true}]) :has-class? true}])
#'user/docs #'user/docs
user> (docs-to-dataset docs "bananas-model" "my-models" :stemmer true :lowercase false) user> (docs-to-dataset docs "bananas-model" "my-models" :stemmer true :lowercase false)

View file

@ -535,7 +535,7 @@ split immediately you can use do-split-dataset."
[docs model-prefix model-dir & opts] [docs model-prefix model-dir & opts]
(let [parsed-opts (apply hash-map opts) (let [parsed-opts (apply hash-map opts)
original-ordering (map :id docs) original-ordering (map :id docs)
docs-with-class (filter :has-class? doc) docs-with-class (filter :has-class? docs)
docs-without-class (let [dwoc (filter #(not (:has-class? %)) docs)] docs-without-class (let [dwoc (filter #(not (:has-class? %)) docs)]
(if (:resample parsed-opts) (if (:resample parsed-opts)
(take (count docs-with-class) dwoc) (take (count docs-with-class) dwoc)
@ -544,7 +544,7 @@ split immediately you can use do-split-dataset."
(concat (take (/ (:keep-n parsed-opts) 2) docs-with-class) (concat (take (/ (:keep-n parsed-opts) 2) docs-with-class)
(take (/ (:keep-n parsed-opts) 2) docs-without-class)) (take (/ (:keep-n parsed-opts) 2) docs-without-class))
(concat docs-with-class docs-without-class)) (concat docs-with-class docs-without-class))
docs-shuffled (my-shuffle (sort-by :id docs-keep-n)) docs-shuffled (shuffle (sort-by :id docs-keep-n))
ds (make-dataset ds (make-dataset
:docs [{:class [:no :yes]} {:title nil} {:fulltext nil}] :docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
(for [doc docs-shuffled] (for [doc docs-shuffled]

View file

@ -229,3 +229,20 @@
[3 :bar] [3 :bar]
[4 :foo]])] [4 :foo]])]
(is (= (dataset-as-vecs (take-dataset ds 2)) [[1.0 :foo] [2.0 :bar]])))) (is (= (dataset-as-vecs (take-dataset ds 2)) [[1.0 :foo] [2.0 :bar]]))))
;; Runs docs-to-dataset over two small documents (one with :has-class? false,
;; one with :has-class? true) and checks the row produced for each document id.
(deftest docs-to-dataset-test
  (let [sample-docs [{:id 10
                      :title "Document title 1"
                      :fulltext "This is the fulltext..."
                      :has-class? false}
                     {:id 11
                      :title "Another document title"
                      :fulltext "Some more \"fulltext\"; rabbit artificial machine bananas"
                      :has-class? true}]
        ;; "." is passed in the model-dir position.
        ;; NOTE(review): presumably model files get written to the current
        ;; directory as a side effect -- confirm against docs-to-dataset.
        {ds :dataset ids :docids} (docs-to-dataset sample-docs "bananas-model" "."
                                                   :stemmer true :lowercase false)
        ;; Pair every returned doc id with the dataset row at the same position,
        ;; so the assertions below can look rows up by id.
        rows-by-id (zipmap ids (dataset-as-vecs ds))]
    ;; Document 10 (:has-class? false) is expected to come back with class :no.
    ;; NOTE(review): the numeric entries are presumably term weights; they are
    ;; compared with exact float equality -- confirm that this is stable.
    (is (= [:no 0.4804530139182014 0.0 0.4804530139182014 0.0]
           (rows-by-id 10)))
    ;; Document 11 (:has-class? true) is expected to come back with class :yes.
    (is (= [:yes 0.0 0.4804530139182014 0.0 0.4804530139182014 0.0 0.4804530139182014 0.0 0.4804530139182014]
           (rows-by-id 11)))))