From 4aa21a62719be5bb32f17ea1e7454742365257c2 Mon Sep 17 00:00:00 2001 From: Thomas Nyberg Date: Wed, 20 Sep 2017 17:32:01 +0200 Subject: [PATCH 1/4] Make LDAResults._expElogbeta() method constant The method was being computed each time it was called, but all data used to do the computation is constant throughout the life of the class. --- rosetta/text/vw_helpers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rosetta/text/vw_helpers.py b/rosetta/text/vw_helpers.py index dfcb857..93f8efc 100644 --- a/rosetta/text/vw_helpers.py +++ b/rosetta/text/vw_helpers.py @@ -332,6 +332,9 @@ def _set_probabilities(self, topics, predictions): self.pr_doc = doc_sums / doc_sums.sum() self.pr_doc_topic = predictions / predictions.sum().sum() + lam = self._lambda_word_sums * self.pr_token_topic + self._constExpElogbeta = np.exp(self._dirichlet_expectation(lam + EPS)) + def prob_token_topic(self, token=None, topic=None, c_token=None, c_topic=None): """ @@ -565,9 +568,7 @@ def _expElogbeta(self): topic-word weights. """ # Get lambda, the dirichlet parameter originally returned by VW. 
- lam = self._lambda_word_sums * self.pr_token_topic - - return np.exp(self._dirichlet_expectation(lam + EPS)) + return self._constExpElogbeta def _dirichlet_expectation(self, alpha): """ From 63e116584d25bab24ad895684b9dddbe55c331e8 Mon Sep 17 00:00:00 2001 From: Thomas Nyberg Date: Fri, 22 Sep 2017 11:06:09 +0200 Subject: [PATCH 2/4] Add missing dependencies to requirements.txt --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index 58c53ba..03b800e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,6 @@ pyth pymongo MySQL-python scipy +unidecode +multiprocess +nltk From 8f73f7063ea1b07c5001d5b0b4ceb979daa09cc9 Mon Sep 17 00:00:00 2001 From: Thomas Nyberg Date: Mon, 2 Oct 2017 09:50:02 +0200 Subject: [PATCH 3/4] Pin pandas to version == 0.16.2 There have been backwards-incompatible changes in later versions of pandas for which rosetta has not been updated. Until rosetta is updated, the version should be pinned to reflect this. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 03b800e..6fb272a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pandas +pandas==0.16.2 scikit-learn statsmodels gensim From a574a0ea47d4410415f1686b20827f638d251dd9 Mon Sep 17 00:00:00 2001 From: Thomas Nyberg Date: Mon, 2 Oct 2017 09:52:47 +0200 Subject: [PATCH 4/4] Remove TestLDAResults.test_expElogbeta() This test was using unexposed internal class methods in a way which was incompatible with legitimate usage. It worked before due to how the internals of the class were designed, but no longer due to the fact that the LDAResults._expElogbeta() method is const. 
--- rosetta/tests/test_text.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/rosetta/tests/test_text.py b/rosetta/tests/test_text.py index d26d470..3bf9f90 100644 --- a/rosetta/tests/test_text.py +++ b/rosetta/tests/test_text.py @@ -323,16 +323,6 @@ def test_dirichlet_expectation(self): [-0.13470677, -13.32429878]]).T assert_allclose(result, benchmark, atol=1e-4) - def test_expElogbeta(self): - # Make sure equal to exponential of dirichlet_expectation when we - # pass in all ones - lda = self.choose_lda('lda') - lda._lambda_word_sums = pd.Series( - np.ones(lda.num_topics), index=lda.topics) - result = lda._expElogbeta - benchmark = np.exp(lda._dirichlet_expectation(lda.pr_token_topic)) - assert_frame_equal(result, benchmark) - def test_predict_1(self): # Use fact that w0 <--> topic_0, w1 <--> topic_1 lda = self.choose_lda('lda_2')