From 1105dac7b8ff297b4b0c17a925c4cb0c734f660c Mon Sep 17 00:00:00 2001 From: Joshua Eckroth Date: Tue, 16 Jul 2013 23:51:12 -0400 Subject: [PATCH] Don't limit the size of fulltext in docs-to-dataset --- src/clj_ml/data.clj | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/clj_ml/data.clj b/src/clj_ml/data.clj index 5c8282d..0eb119b 100644 --- a/src/clj_ml/data.clj +++ b/src/clj_ml/data.clj @@ -523,11 +523,10 @@ split immediately you can use do-split-dataset." (for [doc docs-keep-n] (let [fulltext (str/replace (.text (Jsoup/parse (or (:fulltext doc) (:extracted doc) ""))) #"\s+" " ") - fulltext-sub (str/replace (subs fulltext 0 (min (count fulltext) 10000)) - #"[^ \w\d]" "") + fulltext-fixed (str/replace fulltext #"[^ \w\d]" "") title (str/replace (:title doc "") #"[^ \w\d]" "") has-tid? (some #(= term-name %) (get-in doc [:terms vocab-name]))] - [(if has-tid? :yes :no) title fulltext-sub]))) + [(if has-tid? :yes :no) title fulltext-fixed]))) ds-title (let [f (filters/make-filter :string-to-word-vector {:dataset-format ds