Added function clj-ml.data/docs-to-dataset to support translating text documents (with title, fulltext, and terms) into wordvec datasets for binary classification.

Bumped to version 0.3.10.
This commit is contained in:
Joshua Eckroth 2013-07-04 08:53:30 -04:00
parent 123cd1713c
commit 6f03716d0a
2 changed files with 71 additions and 1 deletions

View file

@@ -1,4 +1,4 @@
-(defproject cc.artifice/clj-ml "0.3.9"
+(defproject cc.artifice/clj-ml "0.3.10"
   :description "Machine Learning library for Clojure built around Weka and friends"
   :java-source-paths ["src/java"]
   :license {:name "MIT License"
@@ -9,6 +9,7 @@
                  [incanter/incanter-charts "1.4.1"]
                  [nz.ac.waikato.cms.weka/weka-stable "3.6.9"]
                  [tw.edu.ntu.csie/libsvm "3.1"]
+                 [org.jsoup/jsoup "1.7.2"]
                  [org.apache.lucene/lucene-analyzers-common "4.3.0"]
                  [org.clojars.chapmanb/fast-random-forest "0.98"]]
   :profiles {:dev

View file

@@ -11,7 +11,12 @@
   the instances. Finally data sets can be transformed into Clojure sequences
   that can be transformed using usual Clojure functions like map, reduce, etc."
   (:use [clj-ml utils])
+  (:use [clj-ml.io :only [load-instances save-instances]])
   (:require [clj-ml.filters :as filters])
+  (:require [clojure.string :as str])
+  (:require [clojure.set :as set])
+  (:use [clojure.java.io :only [file]])
+  (:import [org.jsoup Jsoup])
   (:import (weka.core Instance Instances FastVector Attribute)
           (cljml ClojureInstances)))
@@ -493,3 +498,67 @@ split immediately you can use do-split-dataset."
   "Returns a subset of the given dataset containing the first 'num' instances."
   [ds num]
   (filters/remove-range ds {:range (str "first-" num) :invert true}))
;; text-document datasets
(defn dataset-filename
  "Builds the path of the ARFF instances file for the given vocabulary and
  term under datadir, e.g. \"<datadir>/instances/<vocab-id>-<term-id>-<tag>.arff\".
  tag is a keyword or symbol naming the variant (such as :orig or :title)."
  [datadir vocab-id term-id tag]
  (let [tag-str (name tag)]
    (format "%s/instances/%s-%d-%s.arff" datadir vocab-id term-id tag-str)))
(defn docs-to-dataset
  "Converts a seq of document maps into a word-vector dataset for binary
  classification on whether each doc is tagged with term-name under
  vocab-name.  Each doc map is expected to provide :title, HTML text under
  :fulltext (or :extracted), and a :terms map keyed by vocabulary name —
  TODO confirm exact doc schema against callers.
  keep-n, when non-nil, caps the dataset at keep-n docs (half per class).
  Trailing opts are key-value pairs:
    :resample      when truthy, down-sample negatives to match the positive count
    :lowercase, :words-to-keep, :transform-tf, :transform-idf, :stemmer
                   passed through to the string-to-word-vector filters
    :training      save the unfiltered instances (under datadir, via
                   dataset-filename) so the filters can be re-initialized later
    :testing       initialize the filters from the previously saved training
                   instances before applying them
  Returns the word-vector dataset with the class attribute set to the label."
  [docs vocab-id vocab-name term-id term-name keep-n datadir & opts]
  (let [parsed-opts (apply hash-map opts)
        ;; positive class: docs whose vocab-name terms include term-name
        docs-with-term (filter (fn [doc] (some #(= term-name %)
                                               (get-in doc [:terms vocab-name])))
                               docs)
        ;; negative class: everything else; NOTE set difference does not
        ;; preserve the original doc order
        docs-without-term (let [dwt (set/difference (set docs) (set docs-with-term))]
                            (if (:resample parsed-opts)
                              (take (count docs-with-term) dwt)
                              dwt))
        ;; optionally cap at keep-n docs, taking half from each class
        docs-keep-n (if keep-n (concat (take (/ keep-n 2) docs-with-term)
                                       (take (/ keep-n 2) docs-without-term))
                        (concat docs-with-term docs-without-term))
        ;; raw string dataset: [class title fulltext] per doc
        ds (make-dataset
            :docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
            (for [doc docs-keep-n]
              ;; strip HTML with Jsoup and collapse runs of whitespace;
              ;; prefer :fulltext, fall back to :extracted, then ""
              (let [fulltext (str/replace (.text (Jsoup/parse (or (:fulltext doc) (:extracted doc) "")))
                                          #"\s+" " ")
                    ;; truncate fulltext to at most 10,000 characters
                    fulltext-sub (subs fulltext 0 (min (count fulltext) 10000))
                    title (:title doc "")
                    has-tid? (some #(= term-name %) (get-in doc [:terms vocab-name]))]
                [(if has-tid? :yes :no) title fulltext-sub])))
        ;; vectorize the title attribute (index 1) into "title-*" features
        ds-title (let [f (filters/make-filter
                          :string-to-word-vector
                          {:dataset-format ds
                           :attributes [1]
                           :lowercase (:lowercase parsed-opts true)
                           :prefix "title-" :words-to-keep (:words-to-keep parsed-opts 1000)
                           :transform-tf (:transform-tf parsed-opts true)
                           :transform-idf (:transform-idf parsed-opts true)
                           ;; nil when no stemming requested
                           :stemmer (if (:stemmer parsed-opts false)
                                      "weka.core.stemmers.SnowballStemmer -S English")})]
                   ;; if testing, initialize the filter with the training instances
                   (when (:testing parsed-opts)
                     (filters/filter-apply f (load-instances :arff (file (dataset-filename datadir vocab-id term-id :orig)))))
                   (filters/filter-apply f ds))
        ;; vectorize the remaining fulltext string attribute (at index 1 after
        ;; the title filter — presumably the word features are appended; verify
        ;; against StringToWordVector's attribute ordering) into "fulltext-*"
        ds-title-fulltext (let [f (filters/make-filter
                                   :string-to-word-vector
                                   {:dataset-format ds-title
                                    :attributes [1]
                                    :lowercase (:lowercase parsed-opts true)
                                    :prefix "fulltext-" :words-to-keep (:words-to-keep parsed-opts 1000)
                                    :transform-tf (:transform-tf parsed-opts true)
                                    :transform-idf (:transform-idf parsed-opts true)
                                    :stemmer (if (:stemmer parsed-opts false)
                                               "weka.core.stemmers.SnowballStemmer -S English")})]
                            ;; if testing, initialize the filter with the training instances
                            (when (:testing parsed-opts)
                              (filters/filter-apply f (load-instances :arff (file (dataset-filename datadir vocab-id term-id :title)))))
                            (filters/filter-apply f ds-title))
        ;; attribute 0 (the :yes/:no label) is the class attribute
        ds-class (dataset-set-class ds-title-fulltext 0)]
    ;; if training, save unfiltered instances to re-initialize filter later
    (when (:training parsed-opts)
      (save-instances :arff (file (dataset-filename datadir vocab-id term-id :orig)) ds)
      (save-instances :arff (file (dataset-filename datadir vocab-id term-id :title)) ds-title))
    ds-class))