#coding=utf-8
#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import codecs
import csv
import logging
import re

import jieba
import jieba.posseg
import numpy as np
import xlrd
from wordcloud import WordCloud
def read_xlsx(filename):
    """Read every cell of 'Sheet1' in the workbook at `filename`.

    Returns a list of rows; each row is a list of cell values with all
    whitespace removed from text cells, floats truncated to int, and
    everything else coerced to str.
    """
    workbook = xlrd.open_workbook(filename)
    booksheet = workbook.sheet_by_name('Sheet1')
    p = []
    for row in range(booksheet.nrows):
        row_data = []
        for col in range(booksheet.ncols):
            val = booksheet.cell(row, col).value
            # Only text cells can be whitespace-stripped; numeric cells
            # make re.sub raise TypeError, which we deliberately skip.
            # (The original swallowed a NameError here because `re` was
            # never imported, so the strip silently never happened.)
            try:
                val = re.sub(r'\s+', '', val)
            except TypeError:
                pass
            if isinstance(val, float):
                # xlrd reports all numbers as float; the sheet holds ints.
                val = int(val)
            else:
                val = str(val)
            row_data.append(val)
        p.append(row_data)
    print('The size of p is ' + str(len(p)))
    return p
def seperate(p):
    """Segment every cell of every row with jieba and count noun tokens.

    Only tokens whose POS flag starts with 'n' (nouns) are counted.
    Returns a dict mapping word -> occurrence count.
    """
    result = {}
    count = 0
    for row in p:
        count += 1
        if count % 100 == 0:
            print('Have seperate :# ' + str(count) + ' # words')
        for cell in row:
            try:
                for token in jieba.posseg.cut(cell):
                    if token.flag.startswith('n'):
                        # dict.get replaces the removed-in-py3 has_key();
                        # the first occurrence now correctly counts as 1
                        # (the original initialised v=1 then incremented,
                        # so every word started at 2).
                        result[token.word] = result.get(token.word, 0) + 1
            except Exception as e:
                print(Exception, ":", e)
    return result
def is_chinese(s):
    """Return True if `s` is non-empty and every character is a CJK ideograph.

    The original compared the whole string lexicographically against the
    range endpoints, which effectively only constrained the first
    character (e.g. u'\u4e2da' passed); this checks each character.
    """
    if not s:
        return False
    return all(u"\u4e00" <= ch <= u"\u9fa6" for ch in s)
def cutdict(p, top):
    """Filter a word-frequency dict down to its most frequent Chinese words.

    Keeps only multi-character Chinese words whose count is at or above the
    `top`-th percentile of all such counts.

    p: dict mapping word -> count.
    top: percentile threshold in [0, 100].
    Returns the filtered dict (empty if no word qualifies).
    """
    chinese = {}
    for word, cnt in p.items():
        if len(word) > 1 and is_chinese(word):
            chinese[word] = cnt
    # Guard: np.percentile raises on an empty sequence.
    if not chinese:
        return {}
    # No pre-sort needed: np.percentile handles unsorted input (the
    # original sorted the list first for no effect).
    threshold = np.percentile(list(chinese.values()), top)
    return {w: c for w, c in chinese.items() if c >= threshold}
def initfile(filename):
    """Open `filename` for writing, emit a UTF-8 BOM and the header row.

    Returns (file object, csv.writer) so the caller can keep writing
    rows and eventually close the file.
    """
    out = open(filename, 'wb')
    # BOM first so spreadsheet apps detect the encoding.
    out.write(codecs.BOM_UTF8)
    w = csv.writer(out)
    w.writerow(['詞','詞頻'])
    return out, w
def ci2file(csvfile, writer, result):
    """Dump every multi-character word and its count as a CSV row.

    Single-character words are skipped. Flushes and closes `csvfile`
    when done, so the handle must not be reused afterwards.
    """
    for word, freq in result.items():
        if len(word) > 1:
            writer.writerow([word, freq])
    csvfile.flush()
    csvfile.close()
# Pipeline: read tweets -> segment and count nouns -> dump CSV -> keep top words.
# Without basicConfig the root logger stays at WARNING and every
# logging.info() in this script is silently dropped.
logging.basicConfig(level=logging.INFO)
p = read_xlsx('user_tweets_2.xlsx')
logging.info('Done read tweets')
result = seperate(p)
csvfile, writer = initfile('user_tweets_2.csv')
ci2file(csvfile, writer, result)
# Keep only words at or above the 90th percentile of frequency.
result = cutdict(result, 90)
logging.info('Done Cut result')
print('The size of final result is ' + str(len(result)))
# Generate a word cloud image: the original WordCloud `text` API was
# replaced by the frequencies-based one (fit_words below).
import random
# Produces a distinctive grey/black tone for the cloud.
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Color callback for WordCloud.recolor: a random mid-to-light grey.

    Uses the Random instance WordCloud passes as `random_state` so the
    coloring is reproducible with a fixed seed; the original ignored it
    and always drew from the global RNG.
    """
    rng = random_state if random_state is not None else random
    return "hsl(0, 0%%, %d%%)" % rng.randint(60, 100)
from PIL import Image
import matplotlib.pyplot as plt

# Mask image: the cloud is drawn only inside its non-white area.
mask = np.array(Image.open('timg2.png'))
logging.info('Done Read image')
wordcloud = WordCloud(max_words = 1000, mask = mask,
               margin = 10,font_path='/Library/Fonts/華文仿宋.ttf')
# fit_words() takes a word -> frequency mapping instead of raw text.
wordcloud.fit_words(result)
default_colors = wordcloud.to_array()
# First figure: the custom grey recoloring (fixed seed -> reproducible).
plt.title("Custom colors")
plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3))
wordcloud.to_file("a_new_hope.png")
plt.axis("off")
# Second figure: the default WordCloud colors for comparison.
plt.figure()
plt.title("Default colors")
plt.imshow(default_colors)
plt.axis("off")
plt.show()
print('done')