# Latent Dirichlet Allocation (LDA)

# 2017-06-29
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np

# Load the extract; the listed count/age columns are read as float32.
df = pd.read_csv(
    "~/data/hab_test.csv",
    dtype={
        'hab_test.tot_visit': np.float32,
        'hab_test.tot_visit_ed': np.float32,
        'hab_test.tot_visit_acute': np.float32,
        'hab_test.age_in_2015': np.float32,
    },
    low_memory=False,
)
# fillna returns a new frame, so the result has to be assigned back;
# a bare `df.fillna(0)` leaves df untouched.
df = df.fillna(0)

# Keep only the part after the first dot of each header
# ('hab_test.tot_visit' -> 'tot_visit'). Headers without a dot are kept
# as they are instead of raising IndexError.
df.columns = [
    name.split('.')[1] if '.' in name else name
    for name in df.columns
]


# df.tot_visit_ed [df.tot_visit_ed <=5.0] = 'Less_than_5'
# df.tot_visit_ed [(df.tot_visit_ed > 5.0) & (df.tot_visit_ed <=10.0)] = 'Between_6_and_10'
# df.tot_visit_ed [df.tot_visit_ed >11.0] = 'More_than_11'

# Recode tot_visit into ordinal bands: 0 for <=5, 1 for (5, 10], 2 for >10.
# The masks are built from a snapshot of the raw values so a freshly written
# band code can never be matched by a later condition, and the writes go
# through df.loc because `df.col[mask] = value` is chained assignment, which
# pandas does not guarantee to apply to df itself.
_tot_visit_raw = df['tot_visit'].copy()
df.loc[_tot_visit_raw <= 5.0, 'tot_visit'] = 0
df.loc[(_tot_visit_raw > 5.0) & (_tot_visit_raw <= 10.0), 'tot_visit'] = 1
df.loc[_tot_visit_raw > 10.0, 'tot_visit'] = 2

# df.tot_visit_acute [df.tot_visit_acute <=5.0] = 'Less_than_5'
# df.tot_visit_acute [(df.tot_visit_acute > 5.0) & (df.tot_visit_acute <=10.0)] = 'Between_6_and_10'
# df.tot_visit_acute [df.tot_visit_acute >11.0] = 'More_than_11'

# Recode tot_visit_acute into ordinal bands: 0 for <=5, 1 for (5, 10],
# 2 for >10. Masks come from a snapshot of the raw values and the writes go
# through df.loc, because `df.col[mask] = value` is chained assignment and
# is not guaranteed to modify df.
# NOTE(review): tot_visit_acute is dropped further down without being
# one-hot encoded, so this recode does not reach the model input - confirm
# whether it was meant to be included in the get_dummies column list.
_tot_visit_acute_raw = df['tot_visit_acute'].copy()
df.loc[_tot_visit_acute_raw <= 5.0, 'tot_visit_acute'] = 0
df.loc[
    (_tot_visit_acute_raw > 5.0) & (_tot_visit_acute_raw <= 10.0),
    'tot_visit_acute',
] = 1
df.loc[_tot_visit_acute_raw > 10.0, 'tot_visit_acute'] = 2

# df.age_in_2015 [df.age_in_2015 <=12.0] = 'child_less_than_13'
# df.age_in_2015 [(df.age_in_2015 > 12.0) & (df.tot_visit_acute <=19.0)] = 'Teenage_Between_13_and_19'
# df.age_in_2015 [(df.age_in_2015 > 19.0) & (df.tot_visit_acute <=30.0)] = 'Youth_Between_20_and_30'
# df.age_in_2015 [(df.age_in_2015 > 30.0) & (df.tot_visit_acute <=60.0)] = 'Mid_Between_30_and_60'
# df.age_in_2015 [df.age_in_2015 > 60.0] = 'More_than_60'

# Recode age_in_2015 into ordinal bands:
#   0: <=12, 1: (12, 19], 2: (19, 30], 3: (30, 60], 4: >60.
# Masks come from a snapshot of the raw ages so a freshly written band code
# is never re-matched, and the writes go through df.loc because
# `df.col[mask] = value` is chained assignment and is not guaranteed to
# modify df.
_age_raw = df['age_in_2015'].copy()
df.loc[_age_raw <= 12.0, 'age_in_2015'] = 0
df.loc[(_age_raw > 12.0) & (_age_raw <= 19.0), 'age_in_2015'] = 1
df.loc[(_age_raw > 19.0) & (_age_raw <= 30.0), 'age_in_2015'] = 2
df.loc[(_age_raw > 30.0) & (_age_raw <= 60.0), 'age_in_2015'] = 3
df.loc[_age_raw > 60.0, 'age_in_2015'] = 4

# One-hot encode the banded and categorical columns; every listed column is
# replaced by one indicator column per distinct value.
df = pd.get_dummies(
    data=df,
    columns=[
        'tot_visit',
        'age_in_2015',
        'gender',
        'latest_hospital_type',
        'latest_adm_year',
        'latest_adm_month',
    ],
)
# Remove the columns that must not be part of the matrix handed to LDA.
df = df.drop(
    labels=['row_number', 'person_id', 'tot_visit_ed', 'tot_visit_acute'],
    axis=1,
)

# Parenthesised single-argument print works on both Python 2 and Python 3.
print(df)

# Positional index -> column name, used to label the entries of each topic.
tf_feature_names = dict(enumerate(df.columns))
 
# tf_matrix = df.fillna(0).as_matrix()
# print tf_matrix
# print tf_feature_names
# print df
  
no_topics = 5

# Run LDA
_lda_params = dict(
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
try:
    # scikit-learn >= 0.19 names the topic-count parameter n_components.
    lda = LatentDirichletAllocation(n_components=no_topics, **_lda_params)
except TypeError:
    # Older scikit-learn releases only accept the original n_topics keyword.
    lda = LatentDirichletAllocation(n_topics=no_topics, **_lda_params)
# Any remaining missing values are zero-filled before fitting.
lda = lda.fit(df.fillna(0))
  
# def display_topics(model, feature_names, no_top_words):
#     for topic_idx, topic in enumerate(model.components_):
#         print "Topic %d:" % (topic_idx)
#         print " ".join([feature_names[i]+":"+str(topic[i])
#                         for i in topic.argsort()[:-no_top_words - 1:-1]])
        
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted feature names of every topic.

    Args:
        model: Fitted object exposing ``components_``, an iterable of
            per-topic weight arrays (each must support ``argsort``).
        feature_names: Mapping or sequence that turns a feature index into
            its display name.
        no_top_words: Number of features to list per topic.

    Returns:
        None. For each topic a ``Topic <idx>:`` header line is printed,
        followed by one line with the space-separated feature names in
        descending weight order.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:no_top_words]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Topic %d:" % topic_idx)
        print(" ".join([feature_names[i] for i in top_indices]))
  
# Number of top-weighted features to list for each topic.
no_top_words = 5
display_topics(
    model=lda,
    feature_names=tf_feature_names,
    no_top_words=no_top_words,
)
# Andrew Peng

# Xueping Peng

# Latent Dirichlet Allocation (LDA)