抓取市场数据

6jjcrrmo  于 2021-08-20  发布在  Java
关注(0)|答案(1)|浏览(344)

我试图使用 Selenium 和 Django 从 DexTools 抓取市场数据,但到目前为止无法获取全部数据。你会注意到 DexTools 的市场数据是延迟加载的:只有向下滚动或点击下一页时才会加载新数据;由于全部数据无法在一个网页中显示,数据被分成了 35 页。目前下面的代码只能抓取屏幕上显示的第一页,无法抓取所有数据。我该如何改进代码,才能抓取所有页面的数据?

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

def getData(url):
    """Scrape the currently rendered page of the DexTools market-data table.

    Opens the pair-explorer page in Chrome, waits for the lazy-loaded
    ``ngx-datatable`` to appear, scrolls it into view and prints one line
    per rendered row: date, trade type, USD price, ETH price, amount,
    total ETH and the maker's profile link.  Pagination is NOT handled
    here -- only the rows of the first visible page are read.

    :param url: page to scrape; ``None`` falls back to the original
        hard-coded pair-explorer URL, so existing ``getData(None)``
        callers keep working.
    """
    # Backward-compatible default: the original code ignored `url` entirely.
    if url is None:
        url = ('https://www.dextools.io/app/uniswap/pair-explorer/'
               '0xa29fe6ef9592b5d408cca961d0fb9b1faf497d6d')

    driver = webdriver.Chrome(
        executable_path='C:/Users/denni/OneDrive/Desktop/DextoolScrapper/app/chromedriver.exe'
        )
    try:
        driver.get(url)

        # Wait (up to 10 s) for the lazy-loaded table to exist in the DOM.
        tableElement = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'ngx-datatable'))
        )
        # Bring the table into the viewport so its rows actually render.
        driver.execute_script("arguments[0].scrollIntoView();", tableElement)

        # Scroll the table body to the bottom to force rendering of all rows.
        tableBodyelement = tableElement.find_element_by_tag_name('datatable-body-cell')
        driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight)", tableBodyelement)

        rowWrapper = tableElement.find_elements_by_tag_name('datatable-row-wrapper')

        for row in rowWrapper:
            cells = row.find_elements_by_tag_name('datatable-body-cell')
            date = cells[0].text
            tx_type = cells[1].text  # renamed: don't shadow the builtin `type`
            price_usd = cells[2].text
            price_eth = cells[3].text
            ammount_cuminu = cells[4].text
            total_eth = cells[5].text
            maker = cells[6].find_element_by_tag_name('a').get_attribute('href')
            print(date, tx_type, price_usd, price_eth, ammount_cuminu, total_eth, maker)
            print('----')
    finally:
        # Always release the browser, even if scraping fails part-way.
        driver.quit()

这是上述代码抓取第 1 页数据的结果

epggiuax

epggiuax1#

只需把代码放进一个 `while True` 循环,并在每次循环的末尾点击"下一页"(next)按钮。当不再有下一页按钮时,可以用 `try/except` 捕获异常来退出循环。
此外,可能还需要在 `click()` 之后调用 `sleep()`,让 JavaScript 有时间替换现有 `ngx-datatable` 表格中的内容。
编辑:现在代码使用 `pandas.DataFrame` 将所有内容保存到一个 Excel 文件中。

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

def getData(url):
    """Scrape every page of the DexTools market-data table into an Excel file.

    Pages through the lazy-loaded ``ngx-datatable`` by clicking the
    "go to next page" button at the end of each iteration, collecting one
    ``[date, type, price_usd, price_eth, ammount_cuminu, total_eth, maker]``
    list per trade row, then writes the accumulated rows to ``results.xlsx``.

    :param url: page to scrape; ``None`` falls back to the original
        hard-coded pair-explorer URL, so existing ``getData(None)``
        callers keep working.
    """
    # Function-local import kept (as in the original), but hoisted to the
    # top of the function so the dependency is visible up front.
    import pandas as pd

    # Backward-compatible default: the original code ignored `url` entirely.
    if url is None:
        url = ('https://www.dextools.io/app/uniswap/pair-explorer/'
               '0xa29fe6ef9592b5d408cca961d0fb9b1faf497d6d')

    driver = webdriver.Chrome(
        executable_path='C:/Users/denni/OneDrive/Desktop/DextoolScrapper/app/chromedriver.exe'
        )

    #driver = webdriver.Chrome()
    #driver = webdriver.Firefox()

    all_results = []  # one list per scraped row, across all pages

    try:
        driver.get(url)

        page = 0
        while True:
            page += 1
            print('--- page:', page, '---')

            # Wait (up to 10 s) for the lazy-loaded table to exist in the DOM.
            tableElement = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, 'ngx-datatable'))
            )
            # Bring the table into the viewport so its rows actually render.
            driver.execute_script("arguments[0].scrollIntoView();", tableElement)

            # Scroll the table body to the bottom to force rendering of all rows.
            tableBodyelement = tableElement.find_element_by_tag_name('datatable-body-cell')
            driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight)", tableBodyelement)

            rowWrapper = tableElement.find_elements_by_tag_name('datatable-row-wrapper')

            for row in rowWrapper:
                cells = row.find_elements_by_tag_name('datatable-body-cell')
                date = cells[0].text
                tx_type = cells[1].text  # renamed: don't shadow the builtin `type`
                price_usd = cells[2].text
                price_eth = cells[3].text
                ammount_cuminu = cells[4].text
                total_eth = cells[5].text
                maker = cells[6].find_element_by_tag_name('a').get_attribute('href')
                print(date, tx_type, price_usd, price_eth, ammount_cuminu, total_eth, maker)
                print('----')

                all_results.append([date, tx_type, price_usd, price_eth,
                                    ammount_cuminu, total_eth, maker])

            # Advance to the next page; any failure to find/click the button
            # (no such element, button disabled on the last page, ...) is the
            # deliberate loop-exit signal, so the broad except is intentional.
            try:
                next_page = driver.find_element_by_xpath('//a[@aria-label="go to next page"]')
                next_page.click()
                time.sleep(0.5)  # give the JS time to swap the table contents
            except Exception:
                print("last page???")
                break
    finally:
        # Always release the browser, even if scraping fails part-way.
        driver.quit()

    # After the loop, convert the collected rows to a DataFrame and save it.
    df = pd.DataFrame(all_results, columns=['date', 'type', 'price_usd', 'price_eth', 'ammount_cuminu', 'total_eth', 'maker'])
    df.to_excel('results.xlsx')

# ---

# Guard the scraper behind the standard entry-point check so importing
# this module does not launch a browser as a side effect.
if __name__ == '__main__':
    getData(None)

相关问题