Scrapers for the Baidu movie rankings, Baidu hot search, and Baidu Translate

Important notes!!!!

Do not wrap the get/post calls in an infinite loop. Getting your IP banned is the least of your worries; having the police invite you over for tea is the real problem.
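If you really do need to fetch repeatedly, at least pause between requests. Below is a minimal sketch of a polite fetch loop; the URL list, the timeout, and the 1-second delay are placeholder assumptions, not part of the scripts that follow:

import time

import requests

urls = ['https://www.baidu.com/'] * 3  # hypothetical list of pages to fetch

for url in urls:
    response = requests.get(url, timeout=10)  # bounded wait, no hanging forever
    print(url, response.status_code)
    time.sleep(1)  # space the requests out instead of hammering the server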

Baidu hot search

import requests
from bs4 import BeautifulSoup
from colorama import Fore, init
'''
Fetching the data
'''

init(autoreset=True)  # reset the terminal color after each colored print

n = 1


def write(file_name, data):
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(data)


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/58.0.3029.110 Safari/537.3",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
}
url = "https://www.baidu.com/"
# response = requests.get(url)
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
html_content = response.text

write('origin_data.html', html_content)
# print(soup.type)
# soup.type prints None: BeautifulSoup treats the attribute access as a search
# for a <type> tag, which does not exist in the page
soup = BeautifulSoup(html_content, 'html.parser')

# print(type(soup.prettify()))
# soup.prettify() returns a str: the parsed tree is re-serialized as tidy HTML
# through the soup object
write('soup_data.html', soup.prettify())

# note that only str can be written to a text file
write('headers', str(response.headers))

'''
Analyzing the scraped data
'''

'''
def open_data(file_name):
    with open(file_name, 'r', encoding="utf-8") as file:
        pre = file.read()
    return pre

# print(open_data('soup_data.html'))

pre = open_data('soup_data.html')
pro = pre.find('span', {'class': 'title-content-title'}).text  # fails: pre is a plain str
print(pro)
The data must be queried through a BeautifulSoup instance; you cannot call
find() with tag/attribute arguments on the raw string.
'''

'''
print(soup.find('span', {'class': 'title-content-title'}))
output = soup.find('span', {'class': 'title-content-title'}).text  # find the first match and assign its text to output
'''

outputs = soup.find_all('span', {'class': 'title-content-title'})
# print(type(outputs))

# print the extracted content
# print(output.text)

# truncate output.txt, then append the numbered titles one by one
with open("output.txt", "w") as file:
    pass
for output in outputs:
    with open("output.txt", "a", encoding="utf-8") as file:
        file.write(str(n) + ',' + output.text + "\n")
    n = n + 1
print(Fore.RED + 'The program finished successfully')
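As an aside, the same extraction can be written with a CSS selector and a single file handle. A minimal equivalent sketch, continuing from the soup object built above:

# equivalent to soup.find_all('span', {'class': 'title-content-title'})
titles = soup.select('span.title-content-title')
with open('output.txt', 'w', encoding='utf-8') as file:
    for i, span in enumerate(titles, start=1):
        file.write(f'{i},{span.text}\n')

Opening the file once outside the loop avoids reopening it for every title.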


Result:

(screenshot: image-20231117232723115)

Baidu movie rankings

import requests
from bs4 import BeautifulSoup

n = 1
url = 'https://top.baidu.com/board?tab=movie'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/58.0.3029.110 Safari/537.3",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
}

repo = requests.get(url, headers=headers)
rest = repo.text
soup = BeautifulSoup(rest, 'html.parser')
outputs = soup.find_all('div', {'class': 'c-single-text-ellipsis'})
# print(outputs[0].text)
for output in outputs:
    # each movie contributes two matching divs (title, then blurb),
    # so keep only the odd positions, which are the titles
    if n % 2 == 1:
        with open("output.txt", "a", encoding="utf-8") as file:
            file.write(str(n) + ',' + output.text + "\n")
    n += 1
# print(outputs)
# with open('output.html', 'w', encoding='utf-8') as file:
#     file.write(soup.prettify())
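The n % 2 == 1 test only works if every movie card really yields two c-single-text-ellipsis divs, a title followed by a blurb. Pairing them with zip makes that assumption explicit; a sketch continuing from the outputs list above (the alternating layout is inferred from the odd/even trick, not something the page guarantees):

texts = [div.text.strip() for div in outputs]
# assumed layout: title, blurb, title, blurb, ...
for rank, (title, blurb) in enumerate(zip(texts[0::2], texts[1::2]), start=1):
    print(rank, title, '|', blurb)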

Result:

(screenshot: image-20231117232753644)

Baidu Translate

import requests


def write(file_name, data):
    with open(file_name, "a", encoding="utf-8") as file:
        file.write(data)


url = 'https://fanyi.baidu.com/sug'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/58.0.3029.110 Safari/537.3",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
}

word = input('Enter the word you want to look up: ')

data = {
    'kw': word
}
print(type(data))
repo = requests.post(url, data=data, headers=headers)
result = repo.json()
print(result['data'][0]['v'])
write('历史.txt', str(result) + '\n')  # keep the full JSON response
write('历史记录.txt', result['data'][0]['v'] + '\n')  # keep the first suggestion only
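Judging from result['data'][0]['v'], the sug endpoint returns JSON along the lines of {'errno': 0, 'data': [{'k': ..., 'v': ...}, ...]}, so the script above only saves the first suggestion. A sketch that prints every suggestion and guards against an empty list (the exact response shape is an assumption inferred from the code above):

suggestions = result.get('data', [])
if not suggestions:
    print('No suggestions returned for', word)
for item in suggestions:
    # each entry pairs a suggested keyword 'k' with its translations 'v'
    print(item['k'], '->', item['v'])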

Result:

(screenshot: image-20231117233002664)