3 Answers

zujrkrfu1#

Why do you have to check your URLs manually? You can use urllib.robotparser in Python 3 and do something like this:

import urllib.robotparser as urobot
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
url = "example.com"
rp = urobot.RobotFileParser()
rp.set_url(url + "/robots.txt")
rp.read()
if rp.can_fetch("*", url):
    site = urllib.request.urlopen(url)
    sauce = site.read()
    soup = BeautifulSoup(sauce, "html.parser")
    # base URL of the page that was actually served (after any redirects)
    actual_url = site.geturl()
    my_list = soup.find_all("a", href=True)
    for i in my_list:
        # rather than != "#" you can filter the list before looping over it
        if i["href"] != "#":
            # resolve the link against the page it was found on
            newurl = urllib.parse.urljoin(actual_url, i["href"])
            try:
                if rp.can_fetch("*", newurl):
                    site = urllib.request.urlopen(newurl)
                    # do what you want on each authorized webpage
            except Exception:
                # skip links that cannot be fetched
                pass
else:
    print("cannot scrape")

6tdlim6h2#

You can use the curl command to read the robots.txt file into a single string, then split it on newlines and collect the allowed and disallowed URLs:

import os
result = os.popen("curl https://fortune.com/robots.txt").read()
result_data_set = {"Disallowed":[], "Allowed":[]}
for line in result.split("\n"):
    if ': ' not in line:
        # skip blank lines, comments, and rules without a value
        continue
    if line.startswith('Allow'):        # this is an allowed URL rule
        # keep only the path, ignoring trailing comments or other junk
        result_data_set["Allowed"].append(line.split(': ')[1].split(' ')[0])
    elif line.startswith('Disallow'):   # this is a disallowed URL rule
        result_data_set["Disallowed"].append(line.split(': ')[1].split(' ')[0])

print(result_data_set)
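
If you then want to test a specific path against the parsed rules, a rough check is simple prefix matching. is_allowed below is a hypothetical helper, not part of the answer above, and it ignores wildcards, rule precedence, and per-user-agent groups, which urllib.robotparser handles for you:

def is_allowed(path, rules=result_data_set):
    # naive prefix matching against the parsed Allow/Disallow rules
    if any(rule and path.startswith(rule) for rule in rules["Allowed"]):
        return True
    return not any(rule and path.startswith(rule) for rule in rules["Disallowed"])

print(is_allowed("/"))           # check the site root
print(is_allowed("/some/path"))  # check an arbitrary path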

qyyhg6bp3#

Actually, RobotFileParser can do the job; consider the code from my post on medium.
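
The general RobotFileParser pattern looks roughly like the minimal sketch below; the URL and paths are placeholders, not the code from the linked post:

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://example.com/robots.txt")  # placeholder site
rp.read()

# ask whether a given user agent may fetch a given URL
print(rp.can_fetch("*", "https://example.com/"))
print(rp.can_fetch("*", "https://example.com/some/page"))

# crawl-delay and request-rate, if robots.txt declares them (None otherwise)
print(rp.crawl_delay("*"))
print(rp.request_rate("*"))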