97 changes: 85 additions & 12 deletions biterm/btm.py
@@ -1,7 +1,9 @@
import numpy as np
from itertools import combinations, chain
from tqdm import trange

from .vose_sampler import VoseAlias
import random
print ('fastbtm')

class oBTM:
""" Biterm Topic Model
@@ -10,51 +12,122 @@ class oBTM:
Thanks to jcapde for providing the code on https://github.com/jcapde/Biterm
"""

def __init__(self, num_topics, V, alpha=1., beta=0.01, l=0.5):
def __init__(self, num_topics, V, alpha=1.0, beta=0.01, l=0.5):
self.K = num_topics
self.V = V
self.alpha = np.full(self.K, alpha)
self.beta = np.full((len(self.V), self.K), beta)
self.l = l

def compute_corpus_acceptance(self,n_z,n_wz,b_i,s_topic,t_topic):
doc_proposal_1 = (n_z[t_topic] + self.alpha[t_topic])*(n_wz[b_i[0],t_topic] + self.beta[b_i[0],t_topic])*(n_wz[b_i[1],t_topic] + self.beta[b_i[1],t_topic])
doc_proposal_2 = (n_z[s_topic] + self.alpha[s_topic])*(n_wz[b_i[0],s_topic] + self.beta[b_i[0],s_topic])*(n_wz[b_i[1],s_topic] + self.beta[b_i[1],s_topic])
doc_1 = doc_proposal_1/doc_proposal_2
doc_proposal_3 = ((2 * n_z[s_topic] + (len(self.V)*self.beta[b_i[0],s_topic]))**2)*(n_z[s_topic]+1+self.alpha[s_topic])
doc_proposal_4 = ((2 * n_z[t_topic] + (len(self.V)*self.beta[b_i[0],s_topic]))**2)*(n_z[t_topic]+1+self.alpha[t_topic])
doc_2 = doc_proposal_3/doc_proposal_4
doc_proposal = doc_1*doc_2
return min(1,doc_proposal)

def compute_term_proposal_1(self,wi,wj,n_wz,n_z,s_topic,t_topic):
return (n_wz[wi,t_topic] + self.beta[wi,t_topic])*(n_wz[wj,t_topic] + self.beta[wj,t_topic])*((2 * n_z[s_topic] + (len(self.V)*self.beta[wi,s_topic]))**2)

def compute_term_proposal_2(self,wi,wj,n_wz,n_z,s_topic,t_topic):
return (n_wz[wi,s_topic] + self.beta[wi,s_topic])*(n_wz[wj,s_topic] + self.beta[wj,s_topic])*((2 * n_z[t_topic] + (len(self.V)*self.beta[wi,t_topic]))**2)

def compute_term_proposal_3(self,wi,wj,n_wz,n_z,s_topic,t_topic):
return (n_z[t_topic] + self.alpha[t_topic])*(n_wz[wi,s_topic] + self.beta[wi,s_topic])*(2 * n_z[t_topic] + 1 + (len(self.V)*self.beta[wi,t_topic]))

def compute_term_proposal_4(self,wi,wj,n_wz,n_z,s_topic,t_topic):
return (n_z[s_topic] + self.alpha[s_topic])*(n_wz[wi,t_topic] + self.beta[wi,t_topic])*(2 * n_z[s_topic] + 1 + (len(self.V)*self.beta[wi,s_topic]))

def compute_term_acceptance(self,wi,wj,n_wz,n_z,s_topic,t_topic):
t1 = self.compute_term_proposal_1(wi,wj,n_wz,n_z,s_topic,t_topic)
t2 = self.compute_term_proposal_2(wi,wj,n_wz,n_z,s_topic,t_topic)
t = t1/t2
t3 = self.compute_term_proposal_3(wi,wj,n_wz,n_z,s_topic,t_topic)
t4 = self.compute_term_proposal_4(wi,wj,n_wz,n_z,s_topic,t_topic)
n = t3/t4
term_proposal = t*n
return min(1,term_proposal)
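# The two helpers above compute Metropolis-Hastings acceptance probabilities
# min(1, ratio) for moving a biterm's topic from s_topic to a proposed t_topic
# (this summary is inferred from the code; the PR does not include a derivation).
# compute_corpus_acceptance handles proposals drawn from the corpus-wide topic
# assignments; compute_term_acceptance handles proposals drawn from a per-word
# alias table over n_wz[w, :]. In both, the ratio combines the biterm's
# unnormalized posterior weight under the proposed topic relative to the current
# topic with a correction term for the proposal distribution, in the spirit of
# the cycle proposals used in alias-method samplers such as LightLDA.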

def _gibbs(self, iterations):

Z = np.zeros(len(self.B), dtype=np.int16)
n_wz = np.zeros((len(self.V), self.K), dtype=int)
n_z = np.zeros(self.K, dtype=int)
n_aw = np.zeros(len(self.V),dtype=object)
n_dw = np.zeros(len(self.B),dtype=int)

for i, b_i in enumerate(self.B):
topic = np.random.choice(self.K, 1)[0]
n_wz[b_i[0], topic] += 1
n_wz[b_i[1], topic] += 1
n_z[topic] += 1
Z[i] = topic
n_dw[i] = topic

for index,item in enumerate(n_wz):
n_aw[index] = VoseAlias(item.tolist())
# create alias table for each word

for _ in trange(iterations):
for i, b_i in enumerate(self.B):
n_wz[b_i[0], Z[i]] -= 1
n_wz[b_i[1], Z[i]] -= 1
n_z[Z[i]] -= 1
P_w0z = (n_wz[b_i[0], :] + self.beta[b_i[0], :]) / (2 * n_z + self.beta.sum(axis=0))
P_w1z = (n_wz[b_i[1], :] + self.beta[b_i[1], :]) / (2 * n_z + 1 + self.beta.sum(axis=0))
P_z = (n_z + self.alpha) * P_w0z * P_w1z
# P_z = (n_z + self.alpha) * ((n_wz[b_i[0], :] + self.beta[b_i[0], :]) * (n_wz[b_i[1], :] + self.beta[b_i[1], :]) /
# (((n_wz + self.beta).sum(axis=0) + 1) * (n_wz + self.beta).sum(axis=0))) # todo check out
P_z = P_z / P_z.sum()
Z[i] = np.random.choice(self.K, 1, p=P_z)
proposal = np.random.randint(0,2)
k_topic = Z[i]
# index = np.random.randint(0,self.K)
# proposal_topic = n_dw[int(index)]
# if proposal == 0:
# mh_acceptance = self.compute_corpus_acceptance(n_z,n_wz,b_i,k_topic, proposal_topic)
# else:
# proposal_topic = n_aw[b_i[0]].alias_generation()
# mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic)
# mh_sample = random.uniform(0,1)
# if (mh_sample < mh_acceptance):
# Z[i] = k_topic
# n_wz[b_i[0], Z[i]] += 1
# n_wz[b_i[1], Z[i]] += 1
# n_z[Z[i]] += 1
# else:
# Z[i] = proposal_topic
# n_wz[b_i[0], Z[i]] += 1
# n_wz[b_i[1], Z[i]] += 1
# n_z[Z[i]] += 1
for mh_step in range(1,2):
index = np.random.randint(0,self.K)
proposal_topic = n_dw[int(index)]
# proposal_topic = n_wz[b_i[0],index] #doesnt matter which biterm[0] or 1
mh_acceptance = self.compute_corpus_acceptance(n_z,n_wz,b_i,k_topic, proposal_topic)
mh_sample = random.uniform(0,1)
if (mh_sample < mh_acceptance):
k_topic = proposal_topic
proposal_topic = n_aw[b_i[0]].alias_generation()
mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic)
mh_sample = random.uniform(0,1)
if (mh_sample < mh_acceptance):
k_topic = proposal_topic
proposal_topic = n_aw[b_i[1]].alias_generation()
mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic)
mh_sample = random.uniform(0,1)
if (mh_sample < mh_acceptance):
k_topic = proposal_topic
# n_aw[b_i[0]] = VoseAlias(n_wz[b_i[0]].tolist())
# n_aw[b_i[1]] = VoseAlias(n_wz[b_i[1]].tolist())
Z[i] = k_topic
n_wz[b_i[0], Z[i]] += 1
n_wz[b_i[1], Z[i]] += 1
n_z[Z[i]] += 1


#
return n_z, n_wz

def fit_transform(self, B_d, iterations):
self.fit(B_d, iterations)
return self.transform(B_d)

def fit(self, B_d, iterations):
print ("fastbtm")
self.B = list(chain(*B_d))
n_z, self.nwz = self._gibbs(iterations)

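The sampling pattern used in the new _gibbs loop can be illustrated in isolation: draw a proposal in O(1) from a (possibly stale) alias table, then accept or reject it with a Metropolis-Hastings ratio so the chain still targets the current distribution. The sketch below is a minimal illustration, not the PR's algorithm: mh_resample, target_weights and stale_weights are made-up names, the biterm-specific count updates are omitted, and the import path assumes the package layout introduced in this PR.

import random

import numpy as np

from biterm.vose_sampler import VoseAlias


def mh_resample(current, target_weights, alias, n_steps=2):
    """Run a few Metropolis-Hastings steps with an independence proposal
    drawn from a (possibly stale) alias table."""
    proposal_weights = np.asarray(alias.probabilities, dtype=float)
    for _ in range(n_steps):
        proposal = alias.alias_generation()  # O(1) draw from the stale table
        ratio = (target_weights[proposal] * proposal_weights[current]) / (
            target_weights[current] * proposal_weights[proposal])
        if random.uniform(0, 1) < min(1.0, ratio):
            current = proposal
    return current


target_weights = np.array([5.0, 1.0, 1.0, 3.0])        # current (unnormalized) topic weights
stale_weights = target_weights + np.random.rand(4)      # outdated copy used only for proposals
alias = VoseAlias((stale_weights / stale_weights.sum()).tolist())

counts = np.zeros(4, dtype=int)
z = 0
for _ in range(20000):
    z = mh_resample(z, target_weights, alias)
    counts[z] += 1
print(counts / counts.sum())  # approaches target_weights / target_weights.sum()

In the diff above, the alias tables in n_aw are built once before the sweep over iterations and the per-biterm rebuild (the commented-out VoseAlias calls near the end of the loop) is disabled, so the proposals grow stale over time; the acceptance step is what is usually relied on to keep such a sampler targeting the up-to-date counts.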
92 changes: 92 additions & 0 deletions biterm/vose_sampler.py
@@ -0,0 +1,92 @@
#!/usr/bin/python

#LIBRARIES:
# Standard library
import os
import random
import re
import sys
from decimal import *
from optparse import OptionParser
#import cProfile

# Third-party libraries (only used temporarily for profiling)
#import memory_profiler


class VoseAlias(object):
""" A probability distribution for discrete weighted random variables and its probability/alias
tables for efficient sampling via Vose's Alias Method (a good explanation of which can be found at
http://www.keithschwarz.com/darts-dice-coins/).
"""

def __init__(self, probabilities):
""" (VoseAlias, dict) -> NoneType """
self.probabilities = probabilities
self.alias_initialisation()
self.table_prob_list = list(self.table_prob)

def alias_initialisation(self):
""" Construct probability and alias tables for the distribution. """
# Initialise variables
n = len(self.probabilities)
self.table_prob = {} # probability table
self.table_alias = {} # alias table
scaled_prob = {} # scaled probabilities
small = [] # stack for probabilities smaller than 1
large = [] # stack for probabilities greater than or equal to 1

# Construct and sort the scaled probabilities into their appropriate stacks
for o, p in enumerate(self.probabilities):
scaled_prob[o] = Decimal(p) * n

if scaled_prob[o] < 1:
small.append(o)
else:
large.append(o)

# Construct the probability and alias tables
while small and large:
s = small.pop()
l = large.pop()

self.table_prob[s] = scaled_prob[s]
self.table_alias[s] = l

scaled_prob[l] = (scaled_prob[l] + scaled_prob[s]) - Decimal(1)

if scaled_prob[l] < 1:
small.append(l)
else:
large.append(l)

# The remaining outcomes (of one stack) must have probability 1
while large:
self.table_prob[large.pop()] = Decimal(1)

while small:
self.table_prob[small.pop()] = Decimal(1)

def alias_generation(self):
""" Return a random outcome from the distribution. """
# Determine which column of table_prob to inspect
col = random.choice(self.table_prob_list)

# Determine which outcome to pick in that column
if self.table_prob[col] >= random.uniform(0,1):
return col
else:
return self.table_alias[col]

def main():
try:
test = VoseAlias([0.1, 0.2, 0.7])
print(test.alias_generation())
except Exception as e:
sys.exit("\nError: %s" % e)


if __name__ == "__main__":
main()
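A quick way to sanity-check this sampler is to feed it a small normalized weight vector and compare the empirical frequencies of alias_generation() against the inputs. The snippet below uses only the constructor and alias_generation() defined above; the weight values are made up for illustration, and the import path assumes the package layout in this PR. Note that the construction in alias_initialisation scales each entry by n and splits around 1, so it assumes the inputs already sum to 1; raw counts should be normalized before building the table.

from collections import Counter

from biterm.vose_sampler import VoseAlias

counts = [4, 1, 3, 2]                          # e.g. word-topic counts for one word
probs = [c / sum(counts) for c in counts]      # normalize before building the table
sampler = VoseAlias(probs)

draws = Counter(sampler.alias_generation() for _ in range(100000))
for outcome in sorted(draws):
    print(outcome, round(draws[outcome] / 100000, 3), probs[outcome])  # empirical vs. target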
8 changes: 4 additions & 4 deletions online_btm.py
@@ -25,12 +25,12 @@
print("\n\n Train Online BTM ..")
for i in range(0, len(biterms), 100): # process chunks of 100 texts
biterms_chunk = biterms[i:i + 100]
btm.fit(biterms_chunk, iterations=50)
btm.fit(biterms_chunk, iterations=100)
topics = btm.transform(biterms)

print("\n\n Visualize Topics ..")
vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
pyLDAvis.save_html(vis, './vis/online_btm.html')
# print("\n\n Visualize Topics ..")
# vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
# pyLDAvis.save_html(vis, './vis/online_btm.html')

print("\n\n Topic coherence ..")
topic_summuary(btm.phi_wz.T, X, vocab, 10)
6 changes: 3 additions & 3 deletions simple_btm.py
@@ -24,9 +24,9 @@
print("\n\n Train BTM ..")
topics = btm.fit_transform(biterms, iterations=100)

print("\n\n Visualize Topics ..")
vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
pyLDAvis.save_html(vis, './vis/simple_btm.html')
# print("\n\n Visualize Topics ..")
# vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
# pyLDAvis.save_html(vis, './vis/simple_btm.html')

print("\n\n Topic coherence ..")
topic_summuary(btm.phi_wz.T, X, vocab, 10)