Python:进行网页抓取时,如何循环遍历多个不同的网址?

hmmo2u0o  于 2023-05-16  发布在  Python
关注(0)|答案(2)|浏览(175)

我花了一整天的时间在互联网上查找这个问题的解决办法,但一无所获,因此在这里向大家请教:
我已经成功编写了一段代码,可以从某个特定 URL 抓取特定信息。现在,我想改进这段代码,从多个 URL 抓取同样的信息。

  • 这些网站的 URL 只在末尾相差一个单词。我写了一段代码来构建 URL 列表。在这个例子中,我把它缩减为两个网站(见 changingignwordinurl)。
  • 我想把这两个网站上的“名字”、“触球次数”和“黄牌”数据抓取出来
- the name can be found on both websites
 - the amount of balls touched can be found on (changingignwordinurl = ballkontakte) and 
 - the amount of yellow cards can be found on the second website (changingignwordinurl = fairplay)
  • 我的目标是列出所有球员的名字和他们触球的次数,下面是所有吃到黄牌的球员和黄牌的数量

你们专业人士中有谁能指出我的错误吗?非常感谢您的帮助:)。

from bs4 import BeautifulSoup
import requests
import pandas as pd

# Build the list of pages to scrape: one URL per suffix appended to the shared root.
urllist = []
changingignwordinurl = ["-ballkontakte/","-fairplay/",]
rooturl = "https://sportdaten.spiegel.de/fussball/bundesliga/ma9417803/fc-augsburg_eintracht-frankfurt/spielstatistik"

for word in changingignwordinurl:
    urllist.append(rooturl+word)

# Accumulator that is printed at the end of the script.
playerdata = []

# NOTE(review): this loop only re-defines get_data on every iteration; nothing in
# this snippet ever calls it, so no request is made and playerdata stays empty.
for url in urllist:
    def get_data(url):
        # NOTE(review): the whole urllist is passed here instead of the url parameter.
        response = requests.get(urllist)            
        soup = BeautifulSoup(response.content,"lxml")

        # Look up the statistics table by its class attribute.
        players = soup.find("table", 
                class_="module-statistics statistics")

        # Iterates over the object returned by soup.find(...) above.
        for player in players:
            item={}

            name = player.find("td", class_="person-name")
            ballstouchedtotal = player.find("td", class_="person_stats-balls_touched person_stats-balls_touched-list")
            yellowcards = player.find("td", class_= "person_stats-card_yellow person_stats-card_yellow-list")

            # A missing cell is stored as an empty string.
            item["Name"] = name.text.strip() if name else ""
            item["Balls touched"] = ballstouchedtotal.text.strip() if ballstouchedtotal else ""
            item["Yellow Card"] = yellowcards.text.strip() if yellowcards else ""
                
            # NOTE(review): `data` is not defined anywhere in this snippet; the list
            # returned below is `playerdata`.
            data.append(item)

        return playerdata 

print(playerdata)
tyky79it

tyky79it1#

您已经定义了函数get_data(),但它没有被调用。考虑将函数定义移出循环,并将其替换为函数调用get_data(url)
试试类似的东西:

from bs4 import BeautifulSoup
import requests

def get_data(url):
    """Scrape per-player statistics from one match-statistics page.

    Args:
        url: Address of the statistics page to download.

    Returns:
        A list with one dict per table row that contains a player-name
        cell. Each dict has the keys "Name", "Balls touched" and
        "Yellow Card"; a value is "" when the row has no such cell.
        The list is empty when the statistics table is not on the page.
    """
    response = requests.get(url)
    # "lxml" also works as the parser if that package is installed.
    soup = BeautifulSoup(response.content, "html.parser")

    table = soup.find("table", class_="module-statistics statistics")
    if table is None:
        # soup.find returns None when nothing matches; there is nothing to read.
        return []

    this_url_player_data = []
    # Walk the rows explicitly. Iterating the table itself yields its direct
    # children (whitespace text nodes, <thead>/<tbody>), not the player rows.
    for row in table.find_all("tr"):
        name = row.find("td", class_="person-name")
        if name is None:
            # Rows without a name cell (e.g. header rows) are not players.
            continue

        ballstouchedtotal = row.find("td", class_="person_stats-balls_touched person_stats-balls_touched-list")
        yellowcards = row.find("td", class_="person_stats-card_yellow person_stats-card_yellow-list")

        this_url_player_data.append({
            "Name": name.text.strip(),
            "Balls touched": ballstouchedtotal.text.strip() if ballstouchedtotal else "",
            "Yellow Card": yellowcards.text.strip() if yellowcards else "",
        })
    return this_url_player_data

# Both statistics pages share this prefix and differ only in the trailing suffix.
rooturl = "https://sportdaten.spiegel.de/fussball/bundesliga/ma9417803/fc-augsburg_eintracht-frankfurt/spielstatistik"
changingignwordinurl = ["-ballkontakte/", "-fairplay/"]
urllist = [rooturl + suffix for suffix in changingignwordinurl]

# Collect the records of every page into one flat list.
playerdata = []
for url in urllist:
    playerdata.extend(get_data(url))

print(playerdata)
rseugnpd

rseugnpd2#

这是我的解决方案。

from bs4 import BeautifulSoup
import requests

def get_data(url):
    """Download one statistics page and extract its per-player records.

    A status line is printed for every request, saying whether the
    connection succeeded.

    Args:
        url: Address of the statistics page to download.

    Returns:
        A list with one dict per table row that contains a player-name
        cell. Each dict has the keys "Name", "Balls touched" and
        "Yellow Card"; a value is "" when the row has no such cell.
        The list is empty when the request fails or the statistics
        table is not on the page.
    """
    data = []
    response = requests.get(url)

    if not response.ok:
        print(f"Conection to {url} FAIL")
        return data

    print(f"Conection to {url} OK")
    soup = BeautifulSoup(response.text, "lxml")

    table = soup.find("table", class_="module-statistics statistics")
    if table is None:
        # soup.find returns None when nothing matches; there is nothing to read.
        return data

    # Walk the rows explicitly. Iterating the table itself yields its direct
    # children (whitespace text nodes, <thead>/<tbody>), not the player rows.
    for row in table.find_all("tr"):
        name = row.find("td", class_="person-name")
        if name is None:
            # Rows without a name cell (e.g. header rows) are not players.
            continue

        ballstouchedtotal = row.find("td", class_="person_stats-balls_touched person_stats-balls_touched-list")
        yellowcards = row.find("td", class_="person_stats-card_yellow person_stats-card_yellow-list")

        data.append({
            "Name": name.text.strip(),
            "Balls touched": ballstouchedtotal.text.strip() if ballstouchedtotal else "",
            "Yellow Card": yellowcards.text.strip() if yellowcards else "",
        })

    return data

changingignwordinurl = ["-ballkontakte/", "-fairplay/"]
rooturl = "https://sportdaten.spiegel.de/fussball/bundesliga/ma9417803/fc-augsburg_eintracht-frankfurt/spielstatistik"

# Assemble one URL per suffix.
urllist = []
urllist.extend(rooturl + word for word in changingignwordinurl)

# Fetch every page; each page's records stay grouped in their own sub-list.
playerdata = [get_data(url) for url in urllist]

print(playerdata)

相关问题