编程学习提问回答-前沿社区

新手，用python爬取财富网中国500强数据

"""Scrape the ranking table from a Fortune China page into a text file.

The script downloads the page, collects the text of every table cell inside
the first ``<tbody>``, groups the cells six at a time (one company per row)
and writes each row to ``top500.csv`` with ``|`` after every field.
"""

import re
import time  # imported by the original script; not used below

# Ranking page to fetch (described by the original author as the 2020
# Fortune ranking list).
URL = "http://www.fortunechina.com/fortune500/c/2020-08/10/content_372148.htm"
OUTPUT_PATH = "top500.csv"
COLUMNS_PER_ROW = 6
SEPARATOR = "|"
REQUEST_TIMEOUT_SECONDS = 30


def format_rows(cells, per_row=COLUMNS_PER_ROW, sep=SEPARATOR):
    """Group cell texts into output lines of ``per_row`` fields each.

    Every field is followed by ``sep`` (so each line ends with a trailing
    separator). Cells left over after the last complete row are dropped.

    Args:
        cells: Iterable of cell texts in document order.
        per_row: Number of cells that make up one output line.
        sep: Text appended after every cell.

    Returns:
        A list of lines without line terminators.

    Raises:
        ValueError: If ``per_row`` is smaller than 1.
    """
    if per_row < 1:
        raise ValueError("per_row must be a positive integer")
    cells = list(cells)
    # Only whole rows are emitted; a trailing partial row is ignored.
    complete = len(cells) - len(cells) % per_row
    return [
        "".join(cell + sep for cell in cells[start:start + per_row])
        for start in range(0, complete, per_row)
    ]


def main():
    """Download the ranking page and write its table rows to OUTPUT_PATH."""
    # Third-party imports are local so that format_rows() can be imported
    # without requests / bs4 being installed.
    import requests
    from bs4 import BeautifulSoup

    # A timeout keeps the script from hanging forever on a stalled server,
    # and raise_for_status() stops us from parsing an HTTP error page.
    response = requests.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()

    # Show the encoding requests guessed, then force UTF-8 before decoding.
    print("response.encoding = ", response.encoding)
    response.encoding = 'utf-8'
    print("response.encoding = ", response.encoding)

    text = response.text
    print("text = ", text)

    # html5lib parses the document the way a browser would.
    soap = BeautifulSoup(text, "html5lib")
    print("soap.text = ", soap)

    # The ranking data is taken from the first <tbody> in the document.
    top500 = soap.find('tbody')
    if top500 is None:
        raise RuntimeError("no <tbody> element found in " + URL)

    # get_text() is used instead of .string because .string is None for a
    # cell that has more than one child node, which would break the
    # string concatenation in format_rows().
    cells = [tag.get_text() for tag in top500.find_all(re.compile(r"^td"))]

    # The context manager closes the file even if writing fails.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as file1:
        for newLine in format_rows(cells):
            print("newLine = ", newLine)
            # Terminate each row with a real line break.
            file1.write(newLine + "\n")


if __name__ == "__main__":
    main()

0 条评论

新手，用python爬取财富网中国500强数据

0 个回答

相似问题