背景
博客已经积累了一些技术文章,在写about页面的时候,想把自己的tags分类展示在页面上,但是直接写文字,能不能再low点……想到了词云,于是写个脚本自动生成博客的词云图片,完美!
环境
id | name | Version |
---|---|---|
1 | Python | 2.7 |
数据流
html数据源 -> html解析收集tag -> 词云生成图片
正文
脚本是针对博客建立的,所以数据源取的就是博客文章的tag标签。脚本开发起来很简单,代码也不是很复杂,每个方法都有基本的注释,代码如下:
# -*- coding: utf-8 -*-
"""
------------------------------------------------
describe:
词云生成器
usage:
python tag_cloud.py
base_info:
__version__ = "v.10"
__author__ = "mingliang.gao"
__time__ = "2018/11/21"
__mail__ = "mingliang.gao@qunar.com"
------------------------------------------------
"""
import os
import re
import jieba
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud
from bs4 import BeautifulSoup
# Mask image for the word cloud; joined onto the script directory in
# TagCloudGenerator.__init__ and loaded with PIL in run().
BACKGROUND_NAME = 'public/images/tags_background.png'
# Directory (relative to the script directory) that is walked for HTML pages.
HTML_REL_DIR = 'public/articles'
# Output image path (relative to the script directory) written by run().
TAR_IMG = 'public/images/blog_tags.jpg'
def get_cur_dir():
    """Return the absolute path of the directory that contains this script.

    :return: str, absolute directory path derived from ``__file__``
    """
    return os.path.abspath(os.path.dirname(__file__))
class TagCloudGenerator(object):
    """Build a word-cloud image from the tags found in the blog's HTML pages.

    All paths are resolved against the directory returned by
    ``get_cur_dir()``: ``background`` is the mask image, ``text_source`` is
    the directory scanned for HTML pages, ``tar_img`` is the output image
    and ``font_path`` is the TrueType font handed to ``WordCloud``.
    """

    # Font file location, relative to the script directory.
    FONT_NAME = 'public/publicfiles/fangsong_GB2312.ttf'

    def __init__(self):
        base_dir = get_cur_dir()
        self.background = os.path.join(base_dir, BACKGROUND_NAME)
        self.text_source = os.path.join(base_dir, HTML_REL_DIR)
        self.tar_img = os.path.join(base_dir, TAR_IMG)
        # Anchored to the script directory like the other paths, so the
        # script no longer depends on the current working directory.
        self.font_path = os.path.join(base_dir, self.FONT_NAME)

    def get_index_files(self):
        """
        Collect every file under ``self.text_source`` whose full path
        contains ``index.html``.

        :return: list of file paths; empty when nothing matches or when
                 ``os.walk`` yields nothing for the directory
        """
        # NOTE: the pattern is searched in the full path, so any file whose
        # path contains "index.html" (not only files named exactly that)
        # is returned.
        pattern = re.compile(r'index\.html')
        index_files = []
        for root, _dirs, files in os.walk(self.text_source):
            for name in files:
                full_path = os.path.join(root, name)
                if pattern.search(full_path):
                    index_files.append(full_path)
        return index_files

    def get_index_tags(self, index_html):
        """
        Extract the tag names from one HTML page.

        A tag is the stripped text of an ``<a rel="tag">`` element.

        :param index_html: path of the HTML file to parse
        :return: list of tag strings; empty when the path is falsy or does
                 not point to an existing regular file
        """
        if not index_html or not os.path.isfile(index_html):
            return []
        # Binary mode because ``from_encoding`` only applies to byte input;
        # the ``with`` block closes the handle once the markup is read.
        with open(index_html, 'rb') as html_file:
            soup = BeautifulSoup(html_file, 'html.parser',
                                 from_encoding='utf-8')
        anchors = soup.find_all('a', attrs={"rel": "tag"})
        return [anchor.get_text().strip() for anchor in anchors]

    def collect_tags(self):
        """
        Gather the tags of every page returned by ``get_index_files``.

        Each processed path is printed to stdout.

        :return: flat list of tag strings, duplicates kept
        """
        all_tags = []
        for index_file in self.get_index_files():
            if not index_file:
                continue
            tags = self.get_index_tags(index_file)
            # Parenthesised single-argument form works on Python 2.7 and 3.
            print(index_file)
            all_tags.extend(tags)
        return all_tags

    def run(self):
        """
        Generate the word cloud from all collected tags and save it to
        ``self.tar_img``.

        :return: None
        """
        # The tag list is repeated 100 times before being joined.
        # NOTE(review): presumably this inflates the word counts fed to
        # WordCloud -- confirm whether it is still needed.
        all_html_tags = self.collect_tags() * 100
        wl_space_split = " ".join(all_html_tags)
        # The background image is converted to an array and used as the mask.
        tag_background = np.array(Image.open(self.background))
        my_wordcloud = WordCloud(
            background_color="#CACFD2",
            max_words=2000,
            font_path=self.font_path,
            mask=tag_background,
            stopwords={'企业'},
            max_font_size=150,
            scale=1,
            width=800,
            random_state=1,
        ).generate(wl_space_split)
        # The figure is prepared but never shown; call plt.show() here to
        # preview the result interactively.
        plt.imshow(my_wordcloud)
        plt.axis("off")
        my_wordcloud.to_file(self.tar_img)
if __name__ == '__main__':
    # Script entry point: build the generator and write the tag-cloud image.
    tc = TagCloudGenerator()
    tc.run()
代码存在tag_cloud.py文件中,放在blog的根目录,图片生成的位置:blog/public/images/blog_tags.jpg。
执行方法
python tag_cloud.py
嵌入about
找到about页面的index.md文件,嵌入图片,代码如下:
<img src="/images/blog_tags.jpg" alt="tags cloud" width="88%"/>