python 爬虫基础

从站点获得数据

需要使用到 requests 这个库

requests 库

下载

pip install requests

使用方法

import requests

# User-Agent string copied from the author's own browser; it is sent with the
# request in place of the default one used by requests.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}

# Offset substituted into the `start` query parameter of the URL below.
# It must be defined before the f-string is evaluated.
start = 0

# General form: requests.get(target_url, **options), where the options can
# carry e.g. custom request headers.
# A timeout is passed so a stalled connection cannot block forever.
response = requests.get(
    f"https://movie.douban.com/top250?start={start}&filter=",
    headers=headers,
    timeout=10,
)
# response is a requests.Response object (an instance, not a class) that
# holds the data returned by the server.

html = response.text
# html is a plain str containing the body of the server's reply, i.e. the
# HTML document that was returned.

对从站点获得的数据进行加工处理

需要使用 bs4 库中的 BeautifulSoup 类

bs4 库

下载

pip install bs4

使用方法

导入

from bs4 import BeautifulSoup

BeautifulSoup 类能够把上一步得到的 html 解析成一个树结构的对象，方便后面的操作。

# 第二个参数用来指定解析器
soup = BeautifulSoup(html, "html.parser")

之后可以使用 soup 对象的 find_all 方法（旧写法为 findAll）进行具体内容的查找

# find_all(tag_name, attrs=...): the first argument is the tag to look for,
# the second restricts the matches (here: class="title").
# find_all is the current spelling; findAll is a deprecated alias.
all_names = soup.find_all("span", attrs={"class": "title"})

# Running number written in front of each title. Named `rank` rather than
# `id` so the builtin id() is not shadowed.
rank = 0

# NOTE: `f` is the output file object opened in the complete example
# (with open("douban.txt", ...) as f).
for name in all_names:
    title = str(name.string)
    # Titles containing "/" are skipped - presumably these are the alternate
    # (non-Chinese) titles; verify against the page markup.
    if "/" in title:
        continue
    rank += 1
    f.write(f"{rank}: {title}\n")

实例

获取豆瓣TOP250的中文名字

import requests
from bs4 import BeautifulSoup

# Fetch the Douban Top 250 listing page by page and write the matching
# titles, numbered from 1, to douban.txt (one "<n>: <title>" line each).
with open("douban.txt", "w", encoding="utf-8") as f:
    # Browser User-Agent string sent with every request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
    # Running number for the output lines. Named `rank` rather than `id`
    # so the builtin id() is not shadowed.
    rank = 0
    # Request the pages by offset: start = 0, 25, ..., 225.
    for start in range(0, 250, 25):
        response = requests.get(
            f"https://movie.douban.com/top250?start={start}&filter=",
            headers=headers,
            timeout=10,  # do not hang forever on a stalled connection
        )
        # Fail loudly on an HTTP error status instead of silently parsing an
        # error page and producing an empty or partial file.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        # find_all is the current spelling; findAll is a deprecated alias.
        for name in soup.find_all("span", attrs={"class": "title"}):
            title = str(name.string)
            # Titles containing "/" are skipped - presumably these are the
            # alternate (non-Chinese) titles; verify against the page markup.
            if "/" in title:
                continue
            rank += 1
            f.write(f"{rank}: {title}\n")