在python中抓取一个未按预期显示正确结果的表

f0ofjuux  于 2022-12-25  发布在  Python
关注(0)|答案(1)|浏览(110)

我需要刮从rajya sabha网站的所有表数据.然而,而不是从url链接的代码刮原始表逐页

from selenium import webdriver
import chromedriver_binary
import os
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import csv
import time
import lxml


url = 'https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start=0'

#url_call = f"https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={i}"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

table1 = soup.find('table', id='sam_table')
headers = []
for a in table1.find_all('th'):
    title = a.text
    headers.append(title)
    rsdata = pd.DataFrame(columns = headers)
    rsdata.to_csv('rs_debate_data.csv', mode ='a',index=False)
    
# Create a for loop to fill rajya sabha data
for k in range(0,96):
    url_call = f"https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={k}"
    page = requests.get(url_call)
    for j in table1.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(rsdata)
        rsdata.loc[length] = row
        rsdata.to_csv('rs_debate_data.csv', mode ='a',index=False, header=False)
print(k)

# Export to csv

# Try to read csv
#rs_data = pd.read_csv('rs_debate_data.csv')

我试图在表格的辩论标题栏中只删除与关键词气候变化有关的行。

wvt8vs2t

wvt8vs2t1#

for k in range(0,96):
    url_call = "..."
    page = requests.get(url_call)
    for j in table1.find_all('tr')[1:]:

这个循环在原始table1结果上执行find_all(),而不是在它刚刚获取的页面上...

相关问题