Scraping a Company Homepage's Title, Keywords, and Description

Part of my job touches on SEO for our company website, but I wasn't clear on the Title / Keywords / Description side of it. So I looked at the websites of companies in the same line of business as ours, ran a word-frequency analysis over their keywords to pick out good candidates, combined those with keywords for our own business, and with that finished the Keywords part of the site's SEO.

Python in Practice · Crawler Tutorial Series

Without further ado: the crawler techniques used here were already covered earlier in this series, examples included, so this post mostly just posts the code. Besides scraping the data, the program also analyzes the cleaned-up results to pick out the most frequent keywords, and saves the scraped data to an Excel file. A small sketch of the frequency-counting idea follows.
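As a minimal sketch of that frequency-counting step (the sample keyword string below is made up for illustration; the real program feeds in the meta keywords scraped from each site):

import jieba

# pretend this is the concatenation of all scraped meta keywords
keywords_src = "国际物流,海外仓,跨境电商,国际物流,货运代理,海外仓"

counts = dict()
for word in jieba.lcut(keywords_src, cut_all=False):
    if word in (',', '，', ' ', '-', '|'):  # skip separators
        continue
    counts[word] = counts.get(word, 0) + 1

# most frequent words first
for word, freq in sorted(counts.items(), key=lambda x: x[1], reverse=True):
    print("%s: %s" % (word, freq))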

Approach

Fetch -> Parse -> Store -> Analyze

Highlights

This crawler uses gevent coroutines; if you play with Python and aren't clear on coroutines yet, please read up on them first. A minimal sketch of the pattern used here is below.
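A stripped-down sketch of the gevent pattern the full program uses: spawn one greenlet per URL, join them all, then read each greenlet's return value (the two URLs here are taken from the list further down):

from gevent import monkey; monkey.patch_all()  # make requests cooperative
import gevent
import requests

def fetch(url):
    # each greenlet fetches one page and returns (url, status code)
    return url, requests.get(url, timeout=10).status_code

urls = ["http://www.imlb2c.com/", "http://www.ruecom.cn/"]
jobs = [gevent.spawn(fetch, u) for u in urls]
gevent.joinall(jobs)      # wait for all greenlets to finish
for job in jobs:
    print(job.value)      # the fetch function's return value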

Code

# -*- coding: utf-8 -*-

"""
------------------------------------------------

describe:
    Grab homepage information from the specified logistics-company
    sites, including title, keywords and description.

usage:
    python comp_infos_grab.py


base_info:
    __version__ = "v.10"
    __author__ = "PyGo"
    __time__ = "2019/12/3"
    __mail__ = "gaoming971366@163.com"

------------------------------------------------
"""
from gevent import monkey; monkey.patch_all()  # patch the stdlib before requests is imported
import gevent
import requests
import xlwt
from bs4 import BeautifulSoup
import jieba

PUBLIC_URL_LIST = {
    "IML俄罗斯海外仓": "http://www.imlb2c.com/",
    "旺集科技": "http://www.wangjigroup.com/",
    "黑龙江俄速通国际物流有限公司": "http://www.ruston.cc/",
    "AliExpress全球速卖通": "https://sell.aliexpress.com/zh/__pc/shipping/aliexpress_shipping.htm",
    "中外运集装箱运输有限公司": "http://www.sinolines.com/",
    "乐泰国际物流有限公司": "http://www.letaimzl.com/",
    "NOEL诺艾尔集团": "http://www.noelworld.com/",
    "慧承国际物流": "http://www.hcwuliu.com/",
    "满洲里新颖国际货运代理有限公司": "http://www.mzlxinying.com/",
    "运盟国际物流": "http://www.ym-trans.com/",
    "如易科技": "http://www.ruecom.cn/"
}


class companyGrap(object):
    _instance = None

    def __init__(self):
        super(companyGrap, self).__init__()

    def __new__(cls, *args, **kwargs):
        # simple singleton; object.__new__ only accepts the class itself
        if companyGrap._instance is None:
            companyGrap._instance = object.__new__(cls)

        return companyGrap._instance

    def _get_infos(self, url):
        """fetch one homepage and pull title / keywords / description out of <head>"""
        results = dict()
        results['url'] = url

        if not url:
            return results

        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            head = soup.head
            titles = head.find_all('title')
            tl = titles[0].string if titles else ""
            results['title'] = tl
            keywords = head.find_all('meta', attrs={'name': 'keywords'})
            kw = keywords[0].attrs.get('content') if keywords else ""
            results['keyword'] = kw
            descriptions = head.find_all('meta', attrs={'name': 'description'})
            desc = descriptions[0].attrs.get('content') if descriptions else ""
            results['description'] = desc

        return results

    def to_excel(self, datas, exlname):
        """
        generate data of excel format to save
        :param datas: excel data
        :param exlname: excel name
        :return: None, excel data
        """
        f = xlwt.Workbook(encoding='utf-8')
        sheet = f.add_sheet('sheet', cell_overwrite_ok=True)
        EXCEL_TITLES = ["ID", "NAME", "URL", 'TITLE', 'KEYWORDS', 'DESCRIPTION', "REMARK"]
        BUSINESS = "BUSINESS"

        style_title = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = True
        font.colour_index = 4
        font.height = 220
        style_title.font = font

        style_content = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = False
        font.colour_index = 4
        font.height = 220
        style_content.font = font

        # header row
        for i in range(0, len(EXCEL_TITLES)):
            sheet.write(0, i, EXCEL_TITLES[i], style_title)

        # merge cells and rewrite the grouped headers
        sheet.write_merge(0, 0, 3, 5, BUSINESS, style_title)
        sheet.write_merge(0, 1, 0, 0, 'ID', style_title)
        sheet.write_merge(0, 1, 1, 1, 'NAME', style_title)
        sheet.write_merge(0, 1, 2, 2, 'URL', style_title)
        sheet.write_merge(0, 1, 6, 6, 'REMARK', style_title)
        for i in range(3, 6):
            sheet.write(1, i, EXCEL_TITLES[i], style_content)

        row = 2
        count = 1
        for line in datas:
            sheet.write(row, 0, count, style_title)
            sheet.write(row, 1, line.get('name'), style_content)
            sheet.write(row, 2, line.get('url'), style_content)
            sheet.write(row, 3, line.get('title'), style_content)
            sheet.write(row, 4, line.get('keyword'), style_content)
            sheet.write(row, 5, line.get('description'), style_content)
            row += 1
            count += 1

        f.save(exlname)

    def _deal_url(self, k, v):
        return self._get_infos(v)

    def to_generate_kw(self, datas):
        """cut all scraped keywords with jieba and print them by frequency"""
        keywords_src = ""
        for data in datas:
            if not data:
                continue
            keywords_src += data.get('keyword') or ""

        keywords = jieba.lcut(keywords_src, cut_all=False)
        counts = dict()
        for word in keywords:
            if not word:
                continue
            if word in ('|', ',', ' ', '-', '，'):
                continue
            counts[word] = counts.get(word, 0) + 1

        ord_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        for k in ord_counts:
            print("%s: %s" % (k[0], k[1]))

    def run(self, to_excel=False):
        """
        process run
        :param to_excel: also write the results to an excel file if True
        :return:
        """
        jobs = list()
        names = list()
        excel_datas = list()
        for k, v in PUBLIC_URL_LIST.items():
            if not k or not v:
                continue
            names.append(k)
            jobs.append(gevent.spawn(self._deal_url, k, v))
        gevent.joinall(jobs)
        for name, job in zip(names, jobs):
            value = job.value
            print('==================%s==================' % name)
            print('Title: %s' % value.get('title'))
            print('Keyword: %s' % value.get('keyword'))
            print('Description: %s' % value.get('description'))
            value['name'] = name
            excel_datas.append(value)

        self.to_generate_kw(excel_datas)

        if to_excel:
            print('---------excel ok')
            self.to_excel(excel_datas, 'companys.xls')


if __name__ == '__main__':
    companyGrap().run(to_excel=False)
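A rough usage note (the PyPI package names here are my assumption, based on the imports): install the dependencies with pip install requests gevent beautifulsoup4 jieba xlwt, then run python comp_infos_grab.py to print each site's title/keywords/description plus the word-frequency ranking; call run(to_excel=True) instead if you also want the companys.xls spreadsheet.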