我需要刮从rajya sabha网站的所有表数据.然而,而不是从url链接的代码刮原始表逐页
from selenium import webdriver
import chromedriver_binary
import os
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import csv
import time
import lxml
url = 'https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start=0'
#url_call = f"https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={i}"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table1 = soup.find('table', id='sam_table')
headers = []
for a in table1.find_all('th'):
title = a.text
headers.append(title)
rsdata = pd.DataFrame(columns = headers)
rsdata.to_csv('rs_debate_data.csv', mode ='a',index=False)
# Create a for loop to fill rajya sabha data
for k in range(0,96):
url_call = f"https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={k}"
page = requests.get(url_call)
for j in table1.find_all('tr')[1:]:
row_data = j.find_all('td')
row = [i.text for i in row_data]
length = len(rsdata)
rsdata.loc[length] = row
rsdata.to_csv('rs_debate_data.csv', mode ='a',index=False, header=False)
print(k)
# Export to csv
# Try to read csv
#rs_data = pd.read_csv('rs_debate_data.csv')
我试图在表格的辩论标题栏中只删除与关键词气候变化有关的行。
1条答案
按热度按时间wvt8vs2t1#
这个循环在原始
table1
结果上执行find_all()
,而不是在它刚刚获取的页面上...