A Simple Web Scraping Tutorial

0x00 Prerequisites

# HTML
# HTTP -- HTTP is the foundation of scraping; understand request headers, especially what User-Agent and Cookie mean from a scraper's point of view
# JavaScript
# Python -- the mainstream language for writing scrapers

0x01 Basic Python Libraries

# urllib
# bs4.BeautifulSoup -- HTML parsing; the lxml parser is recommended
# re -- regular expressions

0x02 Learning the urllib Library

  1. urllib.request: GET requests

    import urllib.request

    response = urllib.request.urlopen("http://www.baidu.com")
    print(response.read().decode("utf-8"))
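
    Besides the body, the response object also carries the status code and headers. A minimal sketch (my addition, not from the original):

    import urllib.request

    response = urllib.request.urlopen("http://www.baidu.com")
    print(response.status)                # HTTP status code, e.g. 200
    print(response.getheaders())          # all response headers as (name, value) pairs
    print(response.getheader("Server"))   # a single header, looked up by name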
  2. urllib.request: POST requests

    import urllib.request
    import urllib.parse

    data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
    # httpbin.org echoes the request back as JSON, so the printed "form" field should contain {"hello": "world"}
    response_post = urllib.request.urlopen("http://httpbin.org/post", data=data)
    print(response_post.read().decode("utf-8"))
  3. urllib.request: timeouts

    import urllib.request
    import urllib.error

    try:    # timeout handling
        response = urllib.request.urlopen("http://www.baidu.com", timeout=1)  # raises URLError if the request takes longer than 1 second
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("time out!")

    Most websites today have anti-scraping measures, for example simply checking the User-Agent to judge where a request came from, so the bare calls above are often not enough (the default request header tells the site outright that you are a scraper). The example below demonstrates this.

  4. urllib.request.Request

    urllib.request.Request(url, data=None, headers={}, method=None)
    # url is the only required argument; the rest are optional
    # note: GET is used by default (POST once data is supplied); pass method="POST" to make it explicit
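
    A minimal sketch (my addition, not from the original) of sending a POST through Request; passing data already switches the request to POST, and method="POST" only makes that explicit:

    import urllib.parse
    import urllib.request

    data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
    headers = {"User-Agent": "Mozilla/5.0"}
    req = urllib.request.Request("http://httpbin.org/post", data=data, headers=headers, method="POST")
    print(urllib.request.urlopen(req).read().decode("utf-8"))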

    Below is a comparison of what happens with and without a User-Agent header:

    import urllib.request
    import urllib.error

    # Scraping Douban Top 250
    # With a bare urllib.request GET (no headers), Douban rejects the default urllib
    # User-Agent and urlopen raises an HTTPError (typically HTTP 418)
    try:
        response = urllib.request.urlopen("https://movie.douban.com/top250")
        print(response.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        print(e.code)

    # After adding a User-Agent header, the page is returned normally
    url = "https://movie.douban.com/top250"
    headers = {
        "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.146 Safari/537.36"
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode("utf-8"))

0x03 Learning the bs4.BeautifulSoup Library

  1. Official documentation

    Start with the official documentation and grab the sample HTML there; it is used as the working file for the steps below.

    # You can simply copy the snippet below
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>

    <p class="story">...</p>
    """
  2. Common accessors

    from bs4 import BeautifulSoup

    # Build the BeautifulSoup object
    soup = BeautifulSoup(html_doc, "html.parser")
    # Pretty-print with standard indentation
    print(soup.prettify())
    # ----------------------------------
    # Tags are exposed as attributes of the BeautifulSoup object, e.g. title, head
    soup.title
    # <title>The Dormouse's story</title>

    soup.title.name
    # 'title'

    # Get the text inside a tag
    soup.title.string
    # "The Dormouse's story"

    # Walk from a child node up to its parent (see the docs for the other navigation members)
    soup.title.parent.name
    # 'head'

    soup.p
    # <p class="title"><b>The Dormouse's story</b></p>

    soup.p['class']
    # ['title']

    # Unlike find_all, this returns only the first matching tag
    soup.a
    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

    soup.find_all('a')
    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

    # Find the first tag matching an attribute
    soup.find(id="link3")
    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

    # Get all the text in the document
    print(soup.get_text())

    # Consult the documentation for anything else as needed
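
    A small sketch (my addition, assuming the html_doc defined above) of the attribute-based lookups that the example in 0x06 relies on; find_all accepts a class_ keyword and attribute dicts, and select takes CSS selectors:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_doc, "html.parser")

    # filter by CSS class (class_ avoids clashing with the Python keyword)
    print(soup.find_all("a", class_="sister"))

    # filter by arbitrary attributes
    print(soup.find_all("a", attrs={"id": "link2"}))

    # CSS selectors via select()
    print(soup.select("p.story > a#link1"))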

0x04 Learning the re Library

  1. Regular expression operators

    [screenshots: tables of regular expression operators]

  2. re library functions

    [screenshot: table of re library functions]

    # note: the most frequently used functions, in rough order of frequency (a combined usage sketch follows at the end of this section)
    # re.findall
    # re.match
    # re.sub
  3. Regex matching flags

    [screenshot: table of regex matching flags]

    # note: the most frequently used flags (see the sketch at the end of this section)
    # re.I
    # re.S
  4. A good-enough reference

    A cheat sheet of commonly used regular expressions
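
    A minimal usage sketch (my addition, not from the original) for the functions and flags called out above:

    import re

    text = "Hello World\nhello python"

    # re.findall: all non-overlapping matches; re.I makes the match case-insensitive
    print(re.findall(r"hello \w+", text, re.I))    # ['Hello World', 'hello python']

    # re.match: match only at the beginning of the string
    print(re.match(r"hello", text, re.I).group())  # 'Hello'

    # re.sub: replace every match
    print(re.sub(r"hello", "hi", text, flags=re.I))

    # re.S lets '.' also match newlines, useful for multi-line HTML
    print(re.findall(r"Hello(.*)python", text, re.S))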

0x05 The Scraping Workflow

  1. Preparation

    • Decide what you want to scrape and find the corresponding website
    • Open the developer tools (F12), read the HTML and the network traffic, and locate the URLs and the positions of the data you need
  2. Fetch the data

    Fetch the HTML document with urllib

  3. Parse the data

    Filter the HTML document with bs4.BeautifulSoup or re regular expressions

  4. Save the data

    Store the data as needed

0x06 A Full Example

# -*- coding: utf-8 -*-
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import re


def main():
    baseURL = "https://movie.douban.com/top250?start="
    # 1, 2. fetch and parse the data
    datalist = getData(baseURL)
    # 3. write the data out
    f = open("./doubanTop250.txt", "w", encoding="utf-8")
    f.write(str(datalist))
    f.close()


def getData(baseUrl):
    dataList = []
    for i in range(0, 10):
        url = baseUrl + str(i * 25)
        # 1. fetch one page (25 movies per page)
        html = askURL(url)

        # 2. parse the page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            link = re.findall(r'<a href="(.*?)">', item)[0]            # detail-page link
            data.append(link)
            img = re.findall(r'<img.*src="(.*?)"', item)[0]            # poster image
            data.append(img)
            titles = re.findall(r'<span class="title">(.*?)</span>', item)
            data.append(titles[0])                                     # Chinese title
            title1 = " "
            if len(titles) > 1:
                title1 = titles[1]                                     # foreign title, if present
            data.append(title1)
            other = re.findall(r'<span class="other">(.*?)</span>', item)[0]   # alternative titles
            data.append(other)
            p_ = re.compile(r'<p class="">(.*?)</p>', re.S)            # re.S: the blurb spans multiple lines
            p = re.findall(p_, item)[0]                                # director / cast / year
            data.append(p)
            score = re.findall(r'<span.*average">(.*?)</span>', item)[0]       # rating
            data.append(score)
            num = re.findall(r'<span>(.*?)</span>', item)[0]           # number of ratings
            data.append(num)
            inq = re.findall(r'<span class="inq">(.*?)</span>', item)  # one-line quote (may be missing)
            if len(inq) > 0:
                data.append(inq[0])
            dataList.append(data)
    return dataList


def askURL(url):
    header = {
        "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.146 Safari/537.36"
    }

    request = urllib.request.Request(url, headers=header)
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        return html
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)


if __name__ == '__main__':
    main()
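
The example above simply dumps the list with str(). If structured storage is wanted (step 4 of 0x05), a CSV sketch like the following could be used instead; the saveData name and the column names are my own illustrative choices, not from the original:

import csv

def saveData(dataList, path="./doubanTop250.csv"):
    # one row per movie, in the same order the fields are appended in getData()
    columns = ["link", "image", "title", "foreign_title", "other", "info", "score", "ratings", "quote"]
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        writer.writerows(dataList)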