97 changes: 85 additions & 12 deletions biterm/btm.py
@@ -1,7 +1,9 @@
import numpy as np
from itertools import combinations, chain
from tqdm import trange

from .vose_sampler import VoseAlias
import random
print ('fastbtm')

class oBTM:
""" Biterm Topic Model
@@ -10,51 +12,122 @@ class oBTM:
Thanks to jcapde for providing the code on https://github.com/jcapde/Biterm
"""

def __init__(self, num_topics, V, alpha=1., beta=0.01, l=0.5):
def __init__(self, num_topics, V, alpha=1.0, beta=0.01, l=0.5):
self.K = num_topics
self.V = V
self.alpha = np.full(self.K, alpha)
self.beta = np.full((len(self.V), self.K), beta)
self.l = l

def compute_corpus_acceptance(self,n_z,n_wz,b_i,s_topic,t_topic):
doc_proposal_1 = (n_z[t_topic] + self.alpha[t_topic])*(n_wz[b_i[0],t_topic] + self.beta[b_i[0],t_topic])*(n_wz[b_i[1],t_topic] + self.beta[b_i[1],t_topic])
doc_proposal_2 = (n_z[s_topic] + self.alpha[s_topic])*(n_wz[b_i[0],s_topic] + self.beta[b_i[0],s_topic])*(n_wz[b_i[1],s_topic] + self.beta[b_i[1],s_topic])
doc_1 = doc_proposal_1/doc_proposal_2
doc_proposal_3 = ((2 * n_z[s_topic] + (len(self.V)*self.beta[b_i[0],s_topic]))**2)*(n_z[s_topic]+1+self.alpha[s_topic])
doc_proposal_4 = ((2 * n_z[t_topic] + (len(self.V)*self.beta[b_i[0],s_topic]))**2)*(n_z[t_topic]+1+self.alpha[t_topic])
doc_2 = doc_proposal_3/doc_proposal_4
doc_proposal = doc_1*doc_2
return min(1,doc_proposal)

def compute_term_proposal_1(self,wi,wj,n_wz,n_z,s_topic,t_topic):
return (n_wz[wi,t_topic] + self.beta[wi,t_topic])*(n_wz[wj,t_topic] + self.beta[wj,t_topic])*((2 * n_z[s_topic] + (len(self.V)*self.beta[wi,s_topic]))**2)

def compute_term_proposal_2(self,wi,wj,n_wz,n_z,s_topic,t_topic):
return (n_wz[wi,s_topic] + self.beta[wi,s_topic])*(n_wz[wj,s_topic] + self.beta[wj,s_topic])*((2 * n_z[t_topic] + (len(self.V)*self.beta[wi,t_topic]))**2)

def compute_term_proposal_3(self,wi,wj,n_wz,n_z,s_topic,t_topic):
return (n_z[t_topic] + self.alpha[t_topic])*(n_wz[wi,s_topic] + self.beta[wi,s_topic])*(2 * n_z[t_topic] + 1 + (len(self.V)*self.beta[wi,t_topic]))

def compute_term_proposal_4(self,wi,wj,n_wz,n_z,s_topic,t_topic):
return (n_z[s_topic] + self.alpha[s_topic])*(n_wz[wi,t_topic] + self.beta[wi,t_topic])*(2 * n_z[s_topic] + 1 + (len(self.V)*self.beta[wi,s_topic]))

def compute_term_acceptance(self,wi,wj,n_wz,n_z,s_topic,t_topic):
t1 = self.compute_term_proposal_1(wi,wj,n_wz,n_z,s_topic,t_topic)
t2 = self.compute_term_proposal_2(wi,wj,n_wz,n_z,s_topic,t_topic)
t = t1/t2
t3 = self.compute_term_proposal_3(wi,wj,n_wz,n_z,s_topic,t_topic)
t4 = self.compute_term_proposal_4(wi,wj,n_wz,n_z,s_topic,t_topic)
n = t3/t4
term_proposal = t*n
return min(1,term_proposal)
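# The two helpers above compute Metropolis-Hastings acceptance probabilities
# min(1, ratio) for moving a biterm's topic from s_topic to a proposed t_topic
# (this summary is inferred from the code; the PR does not include a derivation).
# compute_corpus_acceptance handles proposals drawn from the corpus-wide topic
# assignments; compute_term_acceptance handles proposals drawn from a per-word
# alias table over n_wz[w, :]. In both, the ratio combines the biterm's
# unnormalized posterior weight under the proposed topic relative to the current
# topic with a correction term for the proposal distribution, in the spirit of
# the cycle proposals used in alias-method samplers such as LightLDA.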

def _gibbs(self, iterations):

Z = np.zeros(len(self.B), dtype=np.int16)
n_wz = np.zeros((len(self.V), self.K), dtype=int)
n_z = np.zeros(self.K, dtype=int)
n_aw = np.zeros(len(self.V),dtype=object)
n_dw = np.zeros(len(self.B),dtype=int)

for i, b_i in enumerate(self.B):
topic = np.random.choice(self.K, 1)[0]
n_wz[b_i[0], topic] += 1
n_wz[b_i[1], topic] += 1
n_z[topic] += 1
Z[i] = topic
n_dw[i] = topic

for index,item in enumerate(n_wz):
n_aw[index] = VoseAlias(item.tolist())
# create alias table for each word

for _ in trange(iterations):
for i, b_i in enumerate(self.B):
n_wz[b_i[0], Z[i]] -= 1
n_wz[b_i[1], Z[i]] -= 1
n_z[Z[i]] -= 1
P_w0z = (n_wz[b_i[0], :] + self.beta[b_i[0], :]) / (2 * n_z + self.beta.sum(axis=0))
P_w1z = (n_wz[b_i[1], :] + self.beta[b_i[1], :]) / (2 * n_z + 1 + self.beta.sum(axis=0))
P_z = (n_z + self.alpha) * P_w0z * P_w1z
# P_z = (n_z + self.alpha) * ((n_wz[b_i[0], :] + self.beta[b_i[0], :]) * (n_wz[b_i[1], :] + self.beta[b_i[1], :]) /
# (((n_wz + self.beta).sum(axis=0) + 1) * (n_wz + self.beta).sum(axis=0))) # todo check out
P_z = P_z / P_z.sum()
Z[i] = np.random.choice(self.K, 1, p=P_z)
proposal = np.random.randint(0,2)
k_topic = Z[i]
# index = np.random.randint(0,self.K)
# proposal_topic = n_dw[int(index)]
# if proposal == 0:
# mh_acceptance = self.compute_corpus_acceptance(n_z,n_wz,b_i,k_topic, proposal_topic)
# else:
# proposal_topic = n_aw[b_i[0]].alias_generation()
# mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic)
# mh_sample = random.uniform(0,1)
# if (mh_sample < mh_acceptance):
# Z[i] = k_topic
# n_wz[b_i[0], Z[i]] += 1
# n_wz[b_i[1], Z[i]] += 1
# n_z[Z[i]] += 1
# else:
# Z[i] = proposal_topic
# n_wz[b_i[0], Z[i]] += 1
# n_wz[b_i[1], Z[i]] += 1
# n_z[Z[i]] += 1
for mh_step in range(1,2):
index = np.random.randint(0,self.K)
proposal_topic = n_dw[int(index)]
# proposal_topic = n_wz[b_i[0],index] #doesnt matter which biterm[0] or 1
mh_acceptance = self.compute_corpus_acceptance(n_z,n_wz,b_i,k_topic, proposal_topic)
mh_sample = random.uniform(0,1)
if (mh_sample < mh_acceptance):
k_topic = proposal_topic
proposal_topic = n_aw[b_i[0]].alias_generation()
mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic)
mh_sample = random.uniform(0,1)
if (mh_sample < mh_acceptance):
k_topic = proposal_topic
proposal_topic = n_aw[b_i[1]].alias_generation()
mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic)
mh_sample = random.uniform(0,1)
if (mh_sample < mh_acceptance):
k_topic = proposal_topic
# n_aw[b_i[0]] = VoseAlias(n_wz[b_i[0]].tolist())
# n_aw[b_i[1]] = VoseAlias(n_wz[b_i[1]].tolist())
Z[i] = k_topic
n_wz[b_i[0], Z[i]] += 1
n_wz[b_i[1], Z[i]] += 1
n_z[Z[i]] += 1


#
return n_z, n_wz

def fit_transform(self, B_d, iterations):
self.fit(B_d, iterations)
return self.transform(B_d)

def fit(self, B_d, iterations):
print ("fastbtm")
self.B = list(chain(*B_d))
n_z, self.nwz = self._gibbs(iterations)

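The sampling pattern used in the new _gibbs loop can be illustrated in isolation: draw a proposal in O(1) from a (possibly stale) alias table, then accept or reject it with a Metropolis-Hastings ratio so the chain still targets the current distribution. The sketch below is a minimal illustration, not the PR's algorithm: mh_resample, target_weights and stale_weights are made-up names, the biterm-specific count updates are omitted, and the import path assumes the package layout introduced in this PR.

import random

import numpy as np

from biterm.vose_sampler import VoseAlias


def mh_resample(current, target_weights, alias, n_steps=2):
    """Run a few Metropolis-Hastings steps with an independence proposal
    drawn from a (possibly stale) alias table."""
    proposal_weights = np.asarray(alias.probabilities, dtype=float)
    for _ in range(n_steps):
        proposal = alias.alias_generation()  # O(1) draw from the stale table
        ratio = (target_weights[proposal] * proposal_weights[current]) / (
            target_weights[current] * proposal_weights[proposal])
        if random.uniform(0, 1) < min(1.0, ratio):
            current = proposal
    return current


target_weights = np.array([5.0, 1.0, 1.0, 3.0])        # current (unnormalized) topic weights
stale_weights = target_weights + np.random.rand(4)      # outdated copy used only for proposals
alias = VoseAlias((stale_weights / stale_weights.sum()).tolist())

counts = np.zeros(4, dtype=int)
z = 0
for _ in range(20000):
    z = mh_resample(z, target_weights, alias)
    counts[z] += 1
print(counts / counts.sum())  # approaches target_weights / target_weights.sum()

In the diff above, the alias tables in n_aw are built once before the sweep over iterations and the per-biterm rebuild (the commented-out VoseAlias calls near the end of the loop) is disabled, so the proposals grow stale over time; the acceptance step is what is usually relied on to keep such a sampler targeting the up-to-date counts.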
92 changes: 92 additions & 0 deletions biterm/vose_sampler.py
@@ -0,0 +1,92 @@
#!/usr/bin/python

#LIBRARIES:
# Standard library
import os
import random
import re
import sys
from decimal import *
from optparse import OptionParser
#import cProfile

# Third-party libraries (only used temporarily for profiling)
#import memory_profiler


class VoseAlias(object):
""" A probability distribution for discrete weighted random variables and its probability/alias
tables for efficient sampling via Vose's Alias Method (a good explanation of which can be found at
http://www.keithschwarz.com/darts-dice-coins/).
"""

def __init__(self, probabilities):
""" (VoseAlias, dict) -> NoneType """
self.probabilities = probabilities
self.alias_initialisation()
self.table_prob_list = list(self.table_prob)

def alias_initialisation(self):
""" Construct probability and alias tables for the distribution. """
# Initialise variables
n = len(self.probabilities)
self.table_prob = {} # probability table
self.table_alias = {} # alias table
scaled_prob = {} # scaled probabilities
small = [] # stack for probabilities smaller than 1
large = [] # stack for probabilities greater than or equal to 1

# Construct and sort the scaled probabilities into their appropriate stacks
for o, p in enumerate(self.probabilities):
scaled_prob[o] = Decimal(p) * n

if scaled_prob[o] < 1:
small.append(o)
else:
large.append(o)

# Construct the probability and alias tables
while small and large:
s = small.pop()
l = large.pop()

self.table_prob[s] = scaled_prob[s]
self.table_alias[s] = l

scaled_prob[l] = (scaled_prob[l] + scaled_prob[s]) - Decimal(1)

if scaled_prob[l] < 1:
small.append(l)
else:
large.append(l)

# The remaining outcomes (of one stack) must have probability 1
while large:
self.table_prob[large.pop()] = Decimal(1)

while small:
self.table_prob[small.pop()] = Decimal(1)

def alias_generation(self):
""" Return a random outcome from the distribution. """
# Determine which column of table_prob to inspect
col = random.choice(self.table_prob_list)

# Determine which outcome to pick in that column
if self.table_prob[col] >= random.uniform(0,1):
return col
else:
return self.table_alias[col]

def main():
try:
test = VoseAlias([0.1, 0.2, 0.7])
print(test.alias_generation())
except Exception as e:
sys.exit("\nError: %s" % e)


if __name__ == "__main__":
main()
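A quick way to sanity-check this sampler is to feed it a small normalized weight vector and compare the empirical frequencies of alias_generation() against the inputs. The snippet below uses only the constructor and alias_generation() defined above; the weight values are made up for illustration, and the import path assumes the package layout in this PR. Note that the construction in alias_initialisation scales each entry by n and splits around 1, so it assumes the inputs already sum to 1; raw counts should be normalized before building the table.

from collections import Counter

from biterm.vose_sampler import VoseAlias

counts = [4, 1, 3, 2]                          # e.g. word-topic counts for one word
probs = [c / sum(counts) for c in counts]      # normalize before building the table
sampler = VoseAlias(probs)

draws = Counter(sampler.alias_generation() for _ in range(100000))
for outcome in sorted(draws):
    print(outcome, round(draws[outcome] / 100000, 3), probs[outcome])  # empirical vs. target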
8 changes: 4 additions & 4 deletions online_btm.py
@@ -25,12 +25,12 @@
print("\n\n Train Online BTM ..")
for i in range(0, len(biterms), 100): # process chunks of 100 texts
biterms_chunk = biterms[i:i + 100]
btm.fit(biterms_chunk, iterations=50)
btm.fit(biterms_chunk, iterations=100)
topics = btm.transform(biterms)

print("\n\n Visualize Topics ..")
vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
pyLDAvis.save_html(vis, './vis/online_btm.html')
# print("\n\n Visualize Topics ..")
# vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
# pyLDAvis.save_html(vis, './vis/online_btm.html')

print("\n\n Topic coherence ..")
topic_summuary(btm.phi_wz.T, X, vocab, 10)
6 changes: 3 additions & 3 deletions simple_btm.py
@@ -24,9 +24,9 @@
print("\n\n Train BTM ..")
topics = btm.fit_transform(biterms, iterations=100)

print("\n\n Visualize Topics ..")
vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
pyLDAvis.save_html(vis, './vis/simple_btm.html')
# print("\n\n Visualize Topics ..")
# vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
# pyLDAvis.save_html(vis, './vis/simple_btm.html')

print("\n\n Topic coherence ..")
topic_summuary(btm.phi_wz.T, X, vocab, 10)