Scrapers for the Baidu movie rankings, Baidu hot search, and Baidu Translate

Important notes!!!!

Do not wrap the get/post calls in an infinite loop. Getting your IP banned is the least of your worries; having the police invite you over for tea is the real problem.
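If you really do need to fetch repeatedly, at least pause between requests. Below is a minimal sketch of a polite fetch loop; the URL list, the timeout, and the 1-second delay are placeholder assumptions, not part of the scripts that follow:

import time

import requests

urls = ['https://www.baidu.com/'] * 3  # hypothetical list of pages to fetch

for url in urls:
    response = requests.get(url, timeout=10)  # bounded wait, no hanging forever
    print(url, response.status_code)
    time.sleep(1)  # space the requests out instead of hammering the server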

Baidu hot search

import requests
from bs4 import BeautifulSoup
from colorama import Fore, init
'''
Fetching the data
'''

init(autoreset=True)  # reset the terminal color after each colored print

n = 1


def write(file_name, data):
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(data)


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/58.0.3029.110 Safari/537.3",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
}
url = "https://www.baidu.com/"
# response = requests.get(url)
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
html_content = response.text

write('origin_data.html', html_content)
# print(soup.type)
# soup.type prints None: BeautifulSoup treats the attribute access as a search
# for a <type> tag, which does not exist in the page
soup = BeautifulSoup(html_content, 'html.parser')

# print(type(soup.prettify()))
# soup.prettify() returns a str: the parsed tree is re-serialized as tidy HTML
# through the soup object
write('soup_data.html', soup.prettify())

# note that only str can be written to a text file
write('headers', str(response.headers))

'''
Analyzing the scraped data
'''

'''
def open_data(file_name):
    with open(file_name, 'r', encoding="utf-8") as file:
        pre = file.read()
    return pre

# print(open_data('soup_data.html'))

pre = open_data('soup_data.html')
pro = pre.find('span', {'class': 'title-content-title'}).text  # fails: pre is a plain str
print(pro)
The data must be queried through a BeautifulSoup instance; you cannot call
find() with tag/attribute arguments on the raw string.
'''

'''
print(soup.find('span', {'class': 'title-content-title'}))
output = soup.find('span', {'class': 'title-content-title'}).text  # find the first match and assign its text to output
'''

outputs = soup.find_all('span', {'class': 'title-content-title'})
# print(type(outputs))

# print the extracted content
# print(output.text)

# truncate output.txt, then append the numbered titles one by one
with open("output.txt", "w") as file:
    pass
for output in outputs:
    with open("output.txt", "a", encoding="utf-8") as file:
        file.write(str(n) + ',' + output.text + "\n")
    n = n + 1
print(Fore.RED + 'The program finished successfully')
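As an aside, the same extraction can be written with a CSS selector and a single file handle. A minimal equivalent sketch, continuing from the soup object built above:

# equivalent to soup.find_all('span', {'class': 'title-content-title'})
titles = soup.select('span.title-content-title')
with open('output.txt', 'w', encoding='utf-8') as file:
    for i, span in enumerate(titles, start=1):
        file.write(f'{i},{span.text}\n')

Opening the file once outside the loop avoids reopening it for every title.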


Result:

(screenshot: image-20231117232723115)

Baidu movie rankings

import requests
from bs4 import BeautifulSoup

n = 1
url = 'https://top.baidu.com/board?tab=movie'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/58.0.3029.110 Safari/537.3",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
}

repo = requests.get(url, headers=headers)
rest = repo.text
soup = BeautifulSoup(rest, 'html.parser')
outputs = soup.find_all('div', {'class': 'c-single-text-ellipsis'})
# print(outputs[0].text)
for output in outputs:
    # each movie contributes two matching divs (title, then blurb),
    # so keep only the odd positions, which are the titles
    if n % 2 == 1:
        with open("output.txt", "a", encoding="utf-8") as file:
            file.write(str(n) + ',' + output.text + "\n")
    n += 1
# print(outputs)
# with open('output.html', 'w', encoding='utf-8') as file:
#     file.write(soup.prettify())
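The n % 2 == 1 test only works if every movie card really yields two c-single-text-ellipsis divs, a title followed by a blurb. Pairing them with zip makes that assumption explicit; a sketch continuing from the outputs list above (the alternating layout is inferred from the odd/even trick, not something the page guarantees):

texts = [div.text.strip() for div in outputs]
# assumed layout: title, blurb, title, blurb, ...
for rank, (title, blurb) in enumerate(zip(texts[0::2], texts[1::2]), start=1):
    print(rank, title, '|', blurb)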

Result:

(screenshot: image-20231117232753644)

Baidu Translate

import requests


def write(file_name, data):
    with open(file_name, "a", encoding="utf-8") as file:
        file.write(data)


url = 'https://fanyi.baidu.com/sug'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/58.0.3029.110 Safari/537.3",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
}

word = input('Enter the word you want to look up: ')

data = {
    'kw': word
}
print(type(data))
repo = requests.post(url, data=data, headers=headers)
result = repo.json()
print(result['data'][0]['v'])
write('历史.txt', str(result) + '\n')  # keep the full JSON response
write('历史记录.txt', result['data'][0]['v'] + '\n')  # keep the first suggestion only
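Judging from result['data'][0]['v'], the sug endpoint returns JSON along the lines of {'errno': 0, 'data': [{'k': ..., 'v': ...}, ...]}, so the script above only saves the first suggestion. A sketch that prints every suggestion and guards against an empty list (the exact response shape is an assumption inferred from the code above):

suggestions = result.get('data', [])
if not suggestions:
    print('No suggestions returned for', word)
for item in suggestions:
    # each entry pairs a suggested keyword 'k' with its translations 'v'
    print(item['k'], '->', item['v'])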

Result:

(screenshot: image-20231117233002664)