from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
# Load the raw extract. The four numeric columns are read as float32; their
# CSV headers carry a "hab_test." table prefix (see the dtype keys below).
df = pd.read_csv(
    "~/data/hab_test.csv",
    dtype={
        'hab_test.tot_visit': np.float32,
        'hab_test.tot_visit_ed': np.float32,
        'hab_test.tot_visit_acute': np.float32,
        'hab_test.age_in_2015': np.float32,
    },
    low_memory=False,
)
# fillna() returns a new frame; the result must be re-bound or the call is a
# no-op (the original discarded it, leaving every NaN in place).
df = df.fillna(0)
# Strip the "<table>." prefix from each header, keeping the second
# dot-separated token. Headers without a dot are kept unchanged instead of
# raising IndexError.
df.columns = [
    parts[1] if len(parts) > 1 else parts[0]
    for parts in (feature.split('.') for feature in df.columns)
]
def _bucketize(values, upper_edges):
    """Map each value to the ordinal code of the bucket it falls into.

    Buckets are right-closed: with ``upper_edges=[5.0, 10.0]`` the codes are
    0 for ``v <= 5``, 1 for ``5 < v <= 10`` and 2 for ``v > 10``. Missing
    values stay missing (``pd.cut`` yields NaN for them). The result is cast
    back to the input dtype so the codes stay e.g. 0.0/1.0/2.0 for a float
    column, exactly as in-place assignment into that column would leave them.
    """
    codes = pd.cut(
        values,
        bins=[-np.inf] + list(upper_edges) + [np.inf],
        labels=False,
        include_lowest=True,  # keep -inf itself in bucket 0
    )
    return codes.astype(values.dtype)


# Discretise visit counts and age into ordinal codes. Whole-column assignment
# is used instead of chained `df.col[mask] = value`, which raises
# SettingWithCopyWarning and is silently ineffective under pandas
# Copy-on-Write.
#
# Visit counts: 0 = "Less_than_5" (<= 5), 1 = "Between_6_and_10" (6..10),
#               2 = more than 10.
df['tot_visit'] = _bucketize(df['tot_visit'], [5.0, 10.0])
# NOTE: tot_visit_acute is dropped again before the model is fitted, so this
# binning currently has no effect on the LDA input.
df['tot_visit_acute'] = _bucketize(df['tot_visit_acute'], [5.0, 10.0])
# Age: 0 = child (<= 12), 1 = teenage (13..19), 2 = youth (20..30),
#      3 = mid (31..60), 4 = more than 60.
df['age_in_2015'] = _bucketize(df['age_in_2015'], [12.0, 19.0, 30.0, 60.0])
# One-hot encode the categorical / binned columns so every remaining feature
# is a non-negative numeric column that LDA can consume.
df = pd.get_dummies(
    data=df,
    columns=[
        'tot_visit',
        'age_in_2015',
        'gender',
        'latest_hospital_type',
        'latest_adm_year',
        'latest_adm_month',
    ],
)
# Drop identifiers and the visit-count columns that are not used as features.
df = df.drop(
    labels=['row_number', 'person_id', 'tot_visit_ed', 'tot_visit_acute'],
    axis=1,
)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(df)
# Column position -> column name, used to label topic components later.
tf_feature_names = dict(enumerate(df.columns))
no_topics = 5
# Run LDA. `n_components` replaces `n_topics`, which was deprecated in
# scikit-learn 0.19 and later removed. Any remaining NaN is zero-filled
# because the estimator rejects missing values.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(df.fillna(0))
def display_topics(model, feature_names, no_top_words, show_weights=False):
    """Print the highest-weighted features of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic <n>:`` header line is
    printed, followed by one line listing the top features separated by
    spaces, largest weight first.

    Args:
        model: Object exposing ``components_``, an iterable of 1-D arrays
            (one per topic) that support ``argsort()``.
        feature_names: Mapping or sequence from feature position to its
            display name.
        no_top_words: How many features to list per topic. If it exceeds
            the number of features, all of them are listed.
        show_weights: When true, each feature is printed as
            ``name:weight`` instead of just ``name``.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        # argsort() is ascending, so walk it backwards from the end to get
        # the `no_top_words` largest weights in descending order.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        if show_weights:
            labels = [feature_names[i] + ":" + str(topic[i])
                      for i in top_indices]
        else:
            labels = [feature_names[i] for i in top_indices]
        print(" ".join(labels))
# Report the five highest-weighted features for each fitted topic.
no_top_words = 5
display_topics(
    model=lda,
    feature_names=tf_feature_names,
    no_top_words=no_top_words,
)