Fixed some bugs in new docs-to-dataset, added test.
This commit is contained in:
parent
05b59b0cfe
commit
19ce704093
3 changed files with 25 additions and 6 deletions
10
README.md
10
README.md
|
@ -307,12 +307,14 @@ user> (classifier-classify classifier2 instance)
|
||||||
Text document handling:
|
Text document handling:
|
||||||
|
|
||||||
```clojure
|
```clojure
|
||||||
user> (def docs [{:title "Document title 1"
|
user> (def docs [{:id 10
|
||||||
|
:title "Document title 1"
|
||||||
:fulltext "This is the fulltext..."
|
:fulltext "This is the fulltext..."
|
||||||
:has-term? false}
|
:has-class? false}
|
||||||
{:title "Another document title"
|
{:id 11
|
||||||
|
:title "Another document title"
|
||||||
:fulltext "Some more \"fulltext\"; rabbit artificial machine bananas"
|
:fulltext "Some more \"fulltext\"; rabbit artificial machine bananas"
|
||||||
:has-term? true}])
|
:has-class? true}])
|
||||||
#'user/docs
|
#'user/docs
|
||||||
|
|
||||||
user> (docs-to-dataset docs "bananas-model" "my-models" :stemmer true :lowercase false)
|
user> (docs-to-dataset docs "bananas-model" "my-models" :stemmer true :lowercase false)
|
||||||
|
|
|
@ -535,7 +535,7 @@ split immediately you can use do-split-dataset."
|
||||||
[docs model-prefix model-dir & opts]
|
[docs model-prefix model-dir & opts]
|
||||||
(let [parsed-opts (apply hash-map opts)
|
(let [parsed-opts (apply hash-map opts)
|
||||||
original-ordering (map :id docs)
|
original-ordering (map :id docs)
|
||||||
docs-with-class (filter :has-class? doc)
|
docs-with-class (filter :has-class? docs)
|
||||||
docs-without-class (let [dwoc (filter #(not (:has-class? %)) docs)]
|
docs-without-class (let [dwoc (filter #(not (:has-class? %)) docs)]
|
||||||
(if (:resample parsed-opts)
|
(if (:resample parsed-opts)
|
||||||
(take (count docs-with-class) dwoc)
|
(take (count docs-with-class) dwoc)
|
||||||
|
@ -544,7 +544,7 @@ split immediately you can use do-split-dataset."
|
||||||
(concat (take (/ (:keep-n parsed-opts) 2) docs-with-class)
|
(concat (take (/ (:keep-n parsed-opts) 2) docs-with-class)
|
||||||
(take (/ (:keep-n parsed-opts) 2) docs-without-class))
|
(take (/ (:keep-n parsed-opts) 2) docs-without-class))
|
||||||
(concat docs-with-class docs-without-class))
|
(concat docs-with-class docs-without-class))
|
||||||
docs-shuffled (my-shuffle (sort-by :id docs-keep-n))
|
docs-shuffled (shuffle (sort-by :id docs-keep-n))
|
||||||
ds (make-dataset
|
ds (make-dataset
|
||||||
:docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
|
:docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
|
||||||
(for [doc docs-shuffled]
|
(for [doc docs-shuffled]
|
||||||
|
|
|
@ -229,3 +229,20 @@
|
||||||
[3 :bar]
|
[3 :bar]
|
||||||
[4 :foo]])]
|
[4 :foo]])]
|
||||||
(is (= (dataset-as-vecs (take-dataset ds 2)) [[1.0 :foo] [2.0 :bar]]))))
|
(is (= (dataset-as-vecs (take-dataset ds 2)) [[1.0 :foo] [2.0 :bar]]))))
|
||||||
|
|
||||||
|
;; Runs docs-to-dataset on two small documents (one with :has-class? false,
;; one with :has-class? true) and checks the resulting row vector for each.
;; Rows are looked up through a docid -> row-vector map rather than by
;; position in the dataset.
;; NOTE(review): assumes the returned :docids are in the same order as the
;; rows of the returned :dataset -- confirm against docs-to-dataset.
(deftest docs-to-dataset-test
  (let [unlabeled-doc {:id 10
                       :title "Document title 1"
                       :fulltext "This is the fulltext..."
                       :has-class? false}
        labeled-doc {:id 11
                     :title "Another document title"
                     :fulltext "Some more \"fulltext\"; rabbit artificial machine bananas"
                     :has-class? true}
        {result-ds :dataset result-ids :docids}
        (docs-to-dataset [unlabeled-doc labeled-doc] "bananas-model" "."
                         :stemmer true :lowercase false)
        ;; docid -> row vector, pairing each id with its row
        rows-by-docid (zipmap result-ids (dataset-as-vecs result-ds))]
    (is (= [:no 0.4804530139182014 0.0 0.4804530139182014 0.0]
           (get rows-by-docid 10)))
    (is (= [:yes 0.0 0.4804530139182014 0.0 0.4804530139182014 0.0 0.4804530139182014 0.0 0.4804530139182014]
           (get rows-by-docid 11)))))
|
||||||
|
|
Loading…
Reference in a new issue