Don't shuffle docs in doc-to-dataset.

This commit is contained in:
Joshua Eckroth 2013-10-12 21:30:04 -04:00
parent c7c7cdd9f1
commit e11690dd93

View file

@@ -524,13 +524,13 @@ split immediately you can use do-split-dataset."
[docs vocab term keep-n datadir & opts] [docs vocab term keep-n datadir & opts]
(let [parsed-opts (apply hash-map opts) (let [parsed-opts (apply hash-map opts)
docs-with-term (filter (fn [doc] (some #{term} (get-in doc [:terms vocab]))) docs) docs-with-term (filter (fn [doc] (some #{term} (get-in doc [:terms vocab]))) docs)
docs-without-term (let [dwt (set/difference (set docs) (set docs-with-term))] docs-without-term (let [dwt (filter (fn [doc] (not-any? #{term} (get-in doc [:terms vocab]))) docs)]
(if (:resample parsed-opts) (if (:resample parsed-opts)
(take (count docs-with-term) dwt) (take (count docs-with-term) dwt)
dwt)) dwt))
docs-keep-n (shuffle (if keep-n (concat (take (/ keep-n 2) docs-with-term) docs-keep-n (if keep-n (concat (take (/ keep-n 2) docs-with-term)
(take (/ keep-n 2) docs-without-term)) (take (/ keep-n 2) docs-without-term))
(concat docs-with-term docs-without-term))) (concat docs-with-term docs-without-term))
ds (make-dataset ds (make-dataset
:docs [{:class [:no :yes]} {:title nil} {:fulltext nil}] :docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
(for [doc docs-keep-n] (for [doc docs-keep-n]