Python web scraper: loop over a list of objects and change the URL dynamically

qni6mghb  posted on 2023-02-28  in Python

I have this code, which fetches data from one URL:

import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'}
url = 'https://liquipedia.net/counterstrike/S1mple'
soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')

list = ['M0NESY','S1mple']

listdf = pd.DataFrame(list)
listdf.index = listdf.index + 1
print(listdf)

stammdaten = dict( (e.text, e.find_next_sibling('div').text) for e in soup.select('.infobox-description'))
stammdaten['nickname'] = url.split('/')[-1]

#creating a dataframe
index = [1]
stammdatendf = pd.DataFrame(stammdaten, columns = ['nickname', 'Role:','Born:'], index=index)
print(stammdatendf)

#exporting data into Excel
stammdatendf.to_excel('test.xlsx')

This works for the static URL predefined in the code, but I would like the code to loop over the items in list, change the URL dynamically based on each item, and output the nickname, Role: and Born: results in my DataFrame.
Unfortunately, I don't really know how to implement this. I know I should write some kind of loop, but my attempts at it have failed.
Edit:
Basically, what I am trying to achieve is this:

import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'}
urls = ['https://liquipedia.net/counterstrike/S1mple', 'https://liquipedia.net/counterstrike/M0NESY', 'https://liquipedia.net/counterstrike/Dexter']
players = []
for url in urls:
    soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
    stammdaten = dict( (e.text, e.find_next_sibling('div').text) for e in soup.select('.infobox-description'))
    stammdaten['nickname'] = url.split('/')[-1]
    players.append(stammdaten)
    #print(players)

#creating a player DataFrame
playerdf = pd.DataFrame(players, columns =['nickname','Born:','Role:'])
print(playerdf)

However, instead of the urls array, I would like to use an existing array that contains all the player names.
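In other words, I picture building each profile URL from the player name inside the loop, roughly like this (untested):

players = ['M0NESY', 'S1mple']

for player in players:
    # build the profile URL from the player name
    url = 'https://liquipedia.net/counterstrike/' + player
    print(url)  # the scraping code from above would go here instead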


zzlelutf1#

I believe you already understand the basics of Beautiful Soup, but you're missing a structured intuition for how to build a crawler. Assuming you know how functions, if statements and loops work, I've written some pseudocode that should work once you modify it. Look for the parts marked TODO and edit them.

import pandas as pd

visited_urls = []
urls_to_visit = []
forbidden_urls = []  # list of urls that would lead your crawler into places that are NOT relevant, like the home page, the "About us" page, etc.

COLUMNS = []  # TODO: find column names that will exist on all pages that you're scraping

def get_urls_in_url(url):
    """TODO: Get Beautiful Soup to generate a list of urls.
    Additionally, make sure you filter out all the urls that are not important here.
    The goal of this function is to extract only those URLs that will provide the information you need.
    """
    list_of_urls_for_page = []
    return list_of_urls_for_page
  

def get_the_data_you_need_from_page(url):
    """TODO: Scrape the data and put it into the columns you need.
    If the data doesn't exist, return an empty data frame."""
    df = pd.DataFrame(columns=COLUMNS)
    return df

def crawl(initial_url):
    urls_to_visit.append(initial_url)
    
    all_data = pd.DataFrame(columns=COLUMNS)
    
    while True:        
        if len(urls_to_visit) == 0:
            break
        
        url_of_interest = urls_to_visit.pop(0)  # take the first url off the list
        visited_urls.append(url_of_interest)  # mark this url as visited
        
        print(f"Visiting {url_of_interest}...")
        
        # Deal with Data on that page
        print(f"Extracting Data from {url_of_interest}")
        page_data = get_the_data_you_need_from_page(url = url_of_interest)
        all_data = pd.concat([all_data, page_data], ignore_index=True)
        
        # Deal with URL on that page
        print(f"Extracting URLs from {url_of_interest}")
        page_urls = get_urls_in_url(url=url_of_interest)
        
        for url in page_urls:
            if url not in visited_urls and url not in forbidden_urls:
                print(f"Adding {url} to urls worth visiting")
                urls_to_visit.append(url)
            else:
                print(f"Ignoring: {url}")

    # if you've run out of urls to visit, you'll reach this part of the crawl() function
    return all_data  

if __name__ == "__main__":
    target_url = "https://liquipedia.net/counterstrike/S1mple"
    
    data = crawl(initial_url=target_url)  # crawls and builds up a dataframe of information
    
    data.to_csv("text.csv")

The program starts at if __name__ == "__main__": and calls crawl() to crawl the urls.
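To make the first TODO a bit more concrete, here is a minimal sketch of what get_urls_in_url could look like with Beautiful Soup. The link filter (only keeping /counterstrike/ pages without a colon, to skip Category: and similar namespaced pages) and the reuse of a browser-like User-Agent are my own assumptions, not part of the pseudocode above:

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}  # assumption: reuse a browser-like UA as in the question

def get_urls_in_url(url):
    soup = BeautifulSoup(requests.get(url, headers=HEADERS).text, 'html.parser')
    list_of_urls_for_page = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        # assumption: only keep internal Counter-Strike wiki links and skip
        # namespaced pages such as Category:... or Special:...
        if href.startswith('/counterstrike/') and ':' not in href:
            list_of_urls_for_page.append('https://liquipedia.net' + href)
    return list_of_urls_for_page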


shstlldc2#

I think you're trying to figure out how to navigate Liquipedia and scrape player profiles with Beautiful Soup.
In this answer I'll build a scraper for M0NESY's profile page, https://liquipedia.net/counterstrike/M0NESY.

Inspecting the page structure

My first step is to inspect the structure of the page.

Right-click the element you're interested in and click Inspect; the developer tools should open. From there, inspect the HTML to understand the structure of the page and how it is designed, so you can build a custom scraper around it.
It seems the developers haven't made it too easy for us.

Usually, elements carry an id or class attribute that makes them easy to find.
They don't even use a table element; each row is just a repeating pair of nested divs: one div with the description (for example "Name:") and a sibling div with the value.
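If you prefer to confirm that structure from Python instead of the browser, a throwaway snippet along these lines (my own sketch, not part of the final scraper) prints the description/value pair around "Name:":

import bs4
import requests

soup = bs4.BeautifulSoup(requests.get('https://liquipedia.net/counterstrike/M0NESY').text, 'html.parser')

# find a div whose text is exactly "Name:" and print its parent, which
# should contain both the description div and the sibling div with the value
for div in soup.find_all('div'):
    if div.get_text(strip=True) == 'Name:':
        print(div.parent.prettify())
        break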

Creating a strategy to extract specific items

My approach is:

  • Find all divs
  • Pick the div that contains Name:
  • Grab its parent
  • The first child is Name:, so take the second child of that parent, which contains Илья Осипов

It looks like this:

def extract_player_info_from_liquipedia_bs4(content:bs4.BeautifulSoup, descriptor):
    """
    There are no specific tags or classes for that data. Kind of annoying. 
    The structure for that profile table seems to be
     
    div:
        div with a description like "Name"
        div with the details    

    """    
    # find all divs
    divs = content.select("div")
    
    # focus on the one that has the word "Name:" in it
    info_divs = list(filter(lambda x: descriptor in x, divs))  #this sorts through all divs and only returns a list of the divs with the word "Name:" in them.
    
    # find its parent 
    parent = info_divs[0].parent
    
    # parent will have two children: the description and the value
    second_child_with_details = parent.select("div")[1]
    
    return second_child_with_details.contents[0]

The quality of the output will vary a little.
For example:

  • If I run it on Name:, it gets Олександр Олегович Костилєв
  • If I run it on Nationality:, it gets something like "<span class="flag"><a href="/counterstrike/Category:Ukraine" title="Ukraine"><img alt="Ukraine" decoding="async" height="24" loading="lazy" src="/commons/images/b/bb/Ua_hd.png" width="36"/></a></span>"

The latter needs some cleanup to make it readable as plain text.
For fields like that, I create dedicated functions to clean them up.

def clean_nationality_description(nationality_description):
    """
    Turns:
    "<span class="flag"><a href="/counterstrike/Category:Ukraine" title="Ukraine"><img alt="Ukraine" decoding="async" height="24" loading="lazy" src="/commons/images/b/bb/Ua_hd.png" width="36"/></a></span>"
    into
    "Ukraine"    
    """
    nationality = nationality_description.a.get("title")
    return nationality

Creating a script that loops over the profiles

Create a DataFrame for each profile and then add it to a bigger DataFrame. Once you have run through all the players, it is saved to a CSV file, which should open nicely in Excel.

import bs4
import requests
import pandas as pd

def get_page_content(url):
    soup = bs4.BeautifulSoup(requests.get(url).text, 'html.parser')
    return soup

def extract_player_info_from_liquipedia_bs4(content:bs4.BeautifulSoup, descriptor):
    """
    There are no specific tags or classes for that data. Kind of annoying.
    The structure for that profile table seems to be
     
    div:
        div with the description like "Name"
        div with the details    

    """    
    # find all divs
    divs = content.select("div")
    
    # focus on the one that has the word "Name:" in it
    info_divs = list(filter(lambda x: descriptor in x, divs))  #this sorts through all divs and only returns a list of the divs with the word "Name:" in them.
    
    # find its parent to find the actual name.
    parent = info_divs[0].parent
    
    # parent will have two children: the description and the value
    second_child_with_details = parent.select("div")[1]
    
    return second_child_with_details.contents[0]

def clean_born_description(born_description):
    """ 
    Turns:
    "May 1, 2005 (age 17)"
    into:
    "May 1, 2005"
    """
    
    born = born_description.split("(")[0].strip()
    return born

def clean_nationality_description(nationality_description):
    """
    Turns:
    "<span class="flag"><a href="/counterstrike/Category:Ukraine" title="Ukraine"><img alt="Ukraine" decoding="async" height="24" loading="lazy" src="/commons/images/b/bb/Ua_hd.png" width="36"/></a></span>"
    into
    "Ukraine"    
    """
    nationality = nationality_description.a.get("title")
    return nationality


players = ["M0NESY", "S1mple"]
player_tables = pd.DataFrame([], columns = ["handle", "name", "romanized","born", "nationality"])

for loop_index, player_name in enumerate(players): 
    url = 'https://liquipedia.net/counterstrike/' + player_name
    content = get_page_content(url)

    name                      = extract_player_info_from_liquipedia_bs4(content=content, descriptor = "Name:")
    romanized                 = extract_player_info_from_liquipedia_bs4(content=content, descriptor = "Romanized Name:")

    __born_description        = extract_player_info_from_liquipedia_bs4(content=content, descriptor= "Born:")
    __nationality_description = extract_player_info_from_liquipedia_bs4(content=content, descriptor= "Nationality:")

    born        = clean_born_description(__born_description)
    nationality = clean_nationality_description(__nationality_description)

    # build a dataframe
    data = dict( name = [name],
                 romanized = [romanized],
                 born = [born],
                 nationality = [nationality], 
                 handle = [player_name])

    player_info = pd.DataFrame(data, index = [loop_index])
    
    # concat it with our table
    player_tables = pd.concat( [player_tables, player_info]) 
    

player_tables.to_csv("Players.csv")
