From e11690dd93b2685d0efa4d15059f0c9b3a88662c Mon Sep 17 00:00:00 2001 From: Joshua Eckroth Date: Sat, 12 Oct 2013 21:30:04 -0400 Subject: [PATCH] Don't shuffle docs in doc-to-dataset. --- src/clj_ml/data.clj | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/clj_ml/data.clj b/src/clj_ml/data.clj index 2ffea90..d2433f6 100644 --- a/src/clj_ml/data.clj +++ b/src/clj_ml/data.clj @@ -524,13 +524,13 @@ split immediately you can use do-split-dataset." [docs vocab term keep-n datadir & opts] (let [parsed-opts (apply hash-map opts) docs-with-term (filter (fn [doc] (some #{term} (get-in doc [:terms vocab]))) docs) - docs-without-term (let [dwt (set/difference (set docs) (set docs-with-term))] + docs-without-term (let [dwt (filter (fn [doc] (not-any? #{term} (get-in doc [:terms vocab]))) docs)] (if (:resample parsed-opts) (take (count docs-with-term) dwt) dwt)) - docs-keep-n (shuffle (if keep-n (concat (take (/ keep-n 2) docs-with-term) - (take (/ keep-n 2) docs-without-term)) - (concat docs-with-term docs-without-term))) + docs-keep-n (if keep-n (concat (take (/ keep-n 2) docs-with-term) + (take (/ keep-n 2) docs-without-term)) + (concat docs-with-term docs-without-term)) ds (make-dataset :docs [{:class [:no :yes]} {:title nil} {:fulltext nil}] (for [doc docs-keep-n]