I need to scrape a list of URLs stored in a CSV file and export the results to another CSV file. I must be making some mistakes, because I can't get it to run, so I'd appreciate any help.
I'm new to Python and stitched this together from pieces of other code, so I'm having trouble figuring out where the problem is. I combined one script that reads URLs from a CSV with another that does string searches on the fetched pages.
import scrapy
from scrapy import Spider
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import pandas as pd
from urllib.request import urlopen,urlparse, Request,HTTPError
import re
import numpy as np
import csv
from http.client import BadStatusLine
import ssl
Here is the code I have so far.
phn_1 = []
zipcode_1 = []
err_msg_zipcode = []
err = []
class Spider:
    name = 'spider'

    # read csv with just one url per line
    with open('urls.csv') as file:
        start_urls = [line.strip() for line in file]

    def start_request(self):
        request = Request(url=self.start_urls, callback=self.parse)
        yield request

    def parse(self, response):
        s = response.body
        soup = BeautifulSoup(html, 'lxml')
        text = soup.get_text()
        df2 = pd.DataFrame()
        phn_1 = []   # store all the extracted phone numbers in a list
        mail_1 = []  # store all the extracted zipcodes in a list
        for line in df2.iterrows():  # parse through each url in the list
            try:
                try:
                    req1 = Request(row[1]['URL'], headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'})
                    gcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23)  # bypass SSL certificate verification
                    f = urlopen(req1, context=gcontext)
                    url_name = f.geturl()  # extract the URL name
                    s = f.read()
                    phone = re.findall(r'\d{3}-\d{3}-\d{4}', s, re.MULTILINE)
                    zipcode = re.findall(r'(?<=, [A-Z]{2} )\d{5}', s, re.MULTILINE)
                    if len(phone) == 0:
                        print("No phone number found.")
                        err_msg_phn = "No phone number found."
                        phn_1.append((url_name, err_msg_phn))
                    else:
                        count = 1
                        for item in phone:
                            phn_1.append((url_name, item))
                            count += 1
                        print(phn_1)
                    if len(zipcode) == 0:
                        print("No zipcode found.")
                        err_msg_zipcode = "No zipcode address found."
                        zipcode_1.append((url_name, err_msg_zipcode))
                    else:
                        count = 1
                        for item in zipcode:
                            mail_1.append((url_name, item))
                            count += 1
                        print(mail_1)
                except BadStatusLine:  # catch invalid url names
                    print("could not fetch %s" % url_name)
            except urllib3.request.HTTPError as err:  # catch HTTP 404 not found error
                if err == 404:
                    print("Received HTTPError on %s" % url_name)
df_p = pd.DataFrame()
df_m = pd.DataFrame()
df_final = pd.DataFrame()
df_p = pd.DataFrame(phn_1, columns=['URL', 'Phone_No'])  # dataframe for url and phone number
df_phn = df_p.drop_duplicates(subset=['URL', 'Phone_No'], keep='first')  # remove duplicates
df_m = pd.DataFrame(zipcode_1, columns=['URL', 'Zipcode'])  # dataframe for url and zipcode
df_mail = df_m.drop_duplicates(subset=['URL', 'Zipcode'], keep='first')  # remove duplicates
df_final = pd.merge(df_phn, df_mail, on='URL', how='inner')  # merge the two dataframes on the common column
#df_final.groupby(['URL'], as_index=False)
df_final.to_csv('result_contact.csv', index=False, encoding='utf-8')

# convert the csv output to json
with open('result_contact.csv') as f:
    reader = csv.DictReader(f)
    rows = list(reader)
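    # a minimal completion of the conversion step above (the json output
    # file name is an assumption; only the read-back was in the original):
    import json
    with open('result_contact.json', 'w') as jf:
        json.dump(rows, jf, indent=2)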
Thank you!
1 Answer
One obvious error I can see: url should be a string, but you are passing it a list. If you want to send multiple requests, you need to use a loop. And since you have already set start_urls and are using the parse callback, you don't need to override start_requests at all; the default implementation will handle it. You might want to consider setting start_urls in the __init__ method instead.
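A minimal sketch of what that could look like, assuming the goal is unchanged (read urls.csv, extract phone numbers and zipcodes with the regexes from the question, export to CSV). The class name and the "not found" placeholder strings are illustrative; instead of building pandas dataframes by hand, this yields one item per page and lets Scrapy's feed exporter write the file:

import re
import scrapy

class ContactSpider(scrapy.Spider):
    name = 'contact_spider'
    # carry over the User-Agent header from the question
    custom_settings = {'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # set start_urls in __init__, as suggested above
        with open('urls.csv') as f:
            self.start_urls = [line.strip() for line in f if line.strip()]

    # no start_requests override: the default implementation sends one
    # request per entry in start_urls and calls parse on each response

    def parse(self, response):
        text = response.text  # decoded str, so the regexes work (response.body is bytes)
        phones = re.findall(r'\d{3}-\d{3}-\d{4}', text)
        zipcodes = re.findall(r'(?<=, [A-Z]{2} )\d{5}', text)
        yield {
            'URL': response.url,
            'Phone_No': phones or ['No phone number found.'],
            'Zipcode': zipcodes or ['No zipcode found.'],
        }

Running it with scrapy runspider contact_spider.py -o result_contact.csv replaces the manual to_csv step, and the drop_duplicates/merge post-processing from the question can still be run on that file afterwards.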