我是新的scrapy和django集成,但我正在尝试一些简单的事情,让事情在我的职业生涯中与这两个。本质上,我想抓住标题从一个网站,模型将阅读这和意见将上传这一点到一个基本的html模板。
但是,当我运行scrapy crawl test
时,我得到了这个错误
django.core.exceptions.ImproperlyConfigured:已请求设置LOGGING_CONFIG,但未配置设置。在访问设置之前,必须定义环境变量DJANGO_SETTINGS_MODULE或调用settings.configure()。
这是我的树:
── cruise_control
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-38.pyc
│ ├── admin.cpython-38.pyc
│ ├── apps.cpython-38.pyc
│ ├── models.cpython-38.pyc
│ ├── urls.cpython-38.pyc
│ └── views.cpython-38.pyc
├── admin.py
├── apps.py
├── migrations
│ ├── 0001_initial.py
│ ├── __init__.py
│ └── __pycache__
│ ├── 0001_initial.cpython-38.pyc
│ └── __init__.cpython-38.pyc
├── models.py
├── templates
│ └── cruise_control
│ └── basic.html
├── tests.py
├── urls.py
└── views.py
── cruises
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-38.pyc
│ ├── settings.cpython-38.pyc
│ └── urls.cpython-38.pyc
├── asgi.py
├── scraper
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-38.pyc
│ │ └── settings.cpython-38.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ └── test.py
├── scrapy.cfg
├── settings.py
├── urls.py
└── wsgi.py
以下是其中的一些片段:models.py
from django.db import models
class Cruises(models.Model):
title = models.CharField(max_length=200)
views.py
from django.shortcuts import render
from .models import Cruises
def basic(request):
long_list = Cruises.objects.values('title')
return render(request, 'cruise_control/basic.html', context = {'long_list':long_list})
urls.py:
from django.urls import path
from . import views
urlpatterns = [
path('',views.basic, name = 'basic')
]
刮刀:
import scrapy
from scrapy.http import JsonRequest
from scraper.items import ScraperItem
from scrapy.spiders import CrawlSpider
headers = { 'authority': 'www.tripadvisor.co.uk',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
'sec-ch-ua-mobile': '?0',
'x-requested-by': 'TNI1625!AG1YRRpHOjQMgbfsrg1FWY4Ai8UH+StE3D7tD1/oCg3qzWRAYM2ff14YfUM2JUbFAl0x6vTP5McIcIHK3vGsWp/OUNzOT5pIGiZKb0BGLlQkrHttvrrkMiEX1B08Oy4WjTHFseLIh9VcHJi4Gh0/+LjAQFKarv7VPh3A6Lba2SV/',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'accept': '*/*',
'origin': 'https://www.tripadvisor.co.uk',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
#'referer': 'https://www.tripadvisor.co.uk/Cruises-g4-Europe-Cruises',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
#'cookie': 'TADCID=wxS4TbuLpUspXUnWABQCFdpBzzOuRA-9xvCxaMyI12uHBUXU8sLyLHDaoIuwxzQKyBrTFlgsk84ZsL_itZEhwu8hHz-VItOKi2w; TAUnique=%1%enc%3A8kwUAflygK31tlwOhg%2Fo76dh9wxu05Ut4MznwnYlPlg2jHwltRJPGQ%3D%3D; TASSK=enc%3AAPiLXN0t%2B8Q%2Fy9%2FxR%2BD5555CPTdegwxcaa1ok4l9U33f3IyY6Qg8GN25OIJ4ccUZntma4TTL9a%2Bl%2BoIz%2FTAucOZ2TqYV6tkQbMAYMyq1l5ArmPX7CjgQq2QO%2B9HE%2BLVGaw%3D%3D; ServerPool=X; PMC=V2*MS.18*MD.20220214*LD.20220214; TART=%1%enc%3A9bZcDoYP6O8GE%2BreSp1djAImFcYdHhqBzveZGrQkjMRhW3dIKV4FZ%2FdZWju6gRL2CeyaC1LeImE%3D; TATravelInfo=V2*A.2*MG.-1*HP.2*FL.3*RS.1; TASID=A32BBADDF2344464B10CA8620CFBF2AF; TAReturnTo=%1%%2FCruises-g4-Europe-Cruises; ak_bmsc=D33D58B4C6C12D1D96C6D50E4A8267F6~000000000000000000000000000000~YAAQ3Jl6XLHYnul+AQAAegxz9w6zDYaRpZULIXbmwExafFXbLk88He8U5RsFRJJHYrPKRs60IK77pXrkBd1Bl7bvDGDEhZqKkbtOP/6nqDF1R4eUq2ZuIfReBoo+S9nxAuR2rla11JjDVD65qUN1aH0uichlgPClLxslcNh3JclKJzPv3kg7aDgrvT2CaDQ5f5zz2UPkb+EOkAEyOPwhg8exOgHhsbD2BhGqL7PAOPfZPVuocBXutZBOcDBrsy1rZlHC79MQQdX5szmK9zwQnZUVDvmln+DUVXXyN835bImRRSbTNz12EDee2RgtZwmuQNv+eSXnS3gJHBkTErdp7jEdbbCytqTdI2Ix8OR8QzmJnUAXL0dOvpqmUmkGFnWxUz68QxHkf7hC91Pqt3CJ2A==; OptanonAlertBoxClosed=2022-02-14T08:57:03.130Z; eupubconsent-v2=CPUZP2aPUZP23AcABBENCCCsAP_AAH_AACiQIltf_X__b3_j-_5_f_t0eY1P9_7_v-0zjhfdt-8N3f_X_L8X42M7vF36pq4KuR4Eu3LBIQdlHOHcTUmw6okVrzPsbk2cr7NKJ7PEmnMbO2dYGH9_n93TuZKY7______z_v-v_v____f_7-3_3__5_3---_e_V_99zLv9____39nP___9v-_9_____4IhgEmGpeQBdmWODJtGlUKIEYVhIdAKACigGFoisIHVwU7K4CfUELABCagJwIgQYgowYBAAIJAEhEQEgB4IBEARAIAAQAqQEIACNgEFgBYGAQACgGhYgRQBCBIQZHBUcpgQFSLRQT2ViCUHexphCGWeBFAo_oqEBGs0QLAyEhYOY4AkBLxZIHmKF8gAAAAA.f_gAD_gAAAAA; OTAdditionalConsentString=1~39.43.46.55.61.66.70.83.89.93.108.117.122.124.131.135.136.143.144.147.149.159.162.167.171.192.196.202.211.218.228.230.239.241.259.266.272.286.291.311.317.322.323.326.327.338.367.371.385.389.394.397.407.413.415.424.430.436.440.445.449.453.482.486.491.494.495.501.503.505.522.523.540.550.559.560.568.574.576.584.587.591.733.737.745.780.787.802.803.817.820.821.829.839.864.867.874.899.904.922.931.938.979.981.985.1003.1024.1027.1031.1033.1034.1040.1046.1051.1053.1067.1085.1092.1095.1097.1099.1107.1127.1135.1143.1149.1152.1162.1166.1186.1188.1201.1205.1211.1215.1226.1227.1230.1252.1268.1270.1276.1284.1286.1290.1301.1307.1312.1345.1356.1364.1365.1375.1403.1415.1416.1419.1440.1442.1449.1455.1456.1465.1495.1512.1516.1525.1540.1548.1555.1558.1564.1570.1577.1579.1583.1584.1591.1603.1616.1638.1651.1653.1665.1667.1677.1678.1682.1697.1699.1703.1712.1716.1721.1722.1725.1732.1745.1750.1765.1769.1782.1786.1800.1808.1810.1825.1827.1832.1837.1838.1840.1842.1843.1845.1859.1866.1870.1878.1880.1889.1899.1917.1929.1942.1944.1962.1963.1964.1967.1968.1969.1978.2003.2007.2008.2027.2035.2039.2044.2046.2047.2052.2056.2064.2068.2070.2072.2074.2088.2090.2103.2107.2109.2115.2124.2130.2133.2137.2140.2145.2147.2150.2156.2166.2177.2183.2186.2202.2205.2216.2219.2220.2222.2225.2234.2253.2264.2279.2282.2292.2299.2305.2309.2312.2316.2322.2325.2328.2331.2334.2335.2336.2337.2343.2354.2357.2358.2359.2366.2370.2376.2377.2387.2392.2394.2400.2403.2405.2407.2411.2414.2416.2418.2425.2427.2440.2447.2459.2461.2462.2465.2468.2472.2477.2481.2484.2486.2488.2492.2493.2496.2497.2498.2499.2501.2510.2511.2517.2526.2527.2532.2534.2535.2542.2544.2552.2563.2564.2567.2568.2569.2571.2572.2575.2577.2583.2584.2589.2595.2596.2601.2604.2605.2608.2609.2610.2612.2614.2621.2628.2629.2633.2634.2636.2642.2643.2645.2646.2647.2650.2651.2652.2656.2657.2658.2660.2661.2669.2670.2677.2681.2684.2686.2687.2690.2695.2698.2707.2713.2714.2729.2739.2767.2768.2770.2772.2784.2787.2791.2792.2798.2801.2805.2812.2813.2816.2817.2818.2821.2822.2827.2830.2831.2834.2836.2838.2839.2840.2844.2846.2847.2849.2850.2851.2852.2854.2856.2860.2862.2863.2865.2867.2869.2873.2874.2875.2876.2878.2879.2880.2881.2882.2883.2884.2886.2887.2888.2889.2891.2893.2894.2895.2897.2898.2900.2901.2908.2909.2911.2912.2913.2914.2916.2917.2918.2919.2920.2922.2923.2924.2927.2929.2930.2931.2939.2940.2941.2942.2947.2949.2950.2956.2961.2962.2963.2964.2965.2966.2968.2970.2973.2974.2975.2979.2980.2981.2983.2985.2986.2987.2991.2993.2994.2995.2997.2999.3000.3002.3003.3005.3008.3009.3010.3012.3016.3017.3018.3019.3024.3025.3028.3034.3037.3038.3043.3044.3045.3048.3052.3053.3055.3058.3059.3063.3065.3066.3068.3070.3072.3073.3074.3075.3076.3077.3078.3089.3090.3093.3094.3095.3097.3099.3100.3104.3106.3109.3111.3112.3116.3117.3118.3119.3120.3124.3126.3127.3128.3130.3135.3136.3145.3149.3150.3151.3154.3155.3162.3163.3167.3172.3173.3180.3182.3183.3184.3185.3187.3188.3189.3190.3194.3196.3197.3209.3210.3211.3214.3215.3217.3219.3222.3223.3225.3226.3227.3228.3230.3231.3232.3234.3235.3236.3237.3238.3240.3241.3244.3245.3250.3251.3253.3257.3260.3268.3270.3272.3281.3288.3290.3292.3293.3295.3296; TATrkConsent=eyJvdXQiOiIiLCJpbiI6IkFMTCJ9; PAC=AJukZreSlVt2otjGKRNkBz00tWSjLZs1tpXwS8IQ0s9vLyuOrKUvS1c6om5r-WD0fR_Iq3GAZVuS7Hnkp36pQwhrEE0TfQD_2HKg4iY1nBIuQhuDqCWdbnFs1YNDeC2DHqRS5g91y4fgvYu2t67DsbY-k350iSZC1V5Q8MOom6ii; roybatty=TNI1625!APC1CnSJ7d3OhZC8OZmN5URwrla0tLHPbhlztWxjhZhT6aUqZSiQblRTSzow7ftctB099qedPBwThnzphE8mD%2BhqV6BvNYPIhvySRzEFTVzRp06wXxRc8ZRTjzdR%2B6TMtg6r4C0frqplHn1ukZ4jm5nriuS8VgJVY1P1ep6OFaNM%2C1; __vt=wzYY-5Il_FhNE1AIABQCIf6-ytF7QiW7ovfhqc-AvRtk3_lgKJDj5Zq9Ugk-YcW1aWXqbclfQV6lVC3XwLDW4R4P6wRyFZBracNMyGIQ5t0P83yLijLokcFANA9-zVQ698yGW3svmERyK7AnfVnyS4CdjUA; OptanonConsent=isGpcEnabled=0&datestamp=Mon+Feb+14+2022+09%3A54%3A26+GMT%2B0000+(Greenwich+Mean+Time)&version=6.30.0&isIABGlobal=false&hosts=&consentId=bd1952f6-01ff-41b6-861f-a54916a04f3b&interactionCount=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0002%3A1%2CC0003%3A1%2CC0004%3A1%2CSTACK42%3A1&geolocation=GB%3BENG&AwaitingReconsent=false; SRT=%1%enc%3A9bZcDoYP6O8GE%2BreSp1djAImFcYdHhqBzveZGrQkjMRhW3dIKV4FZ%2FdZWju6gRL2CeyaC1LeImE%3D; TASession=V2ID.A32BBADDF2344464B10CA8620CFBF2AF*SQ.11*LS.PageMoniker*GR.86*TCPAR.46*TBR.15*EXEX.62*ABTR.9*PHTB.8*FS.28*CPU.39*HS.recommended*ES.popularity*DS.5*SAS.popularity*FPS.oldFirst*FA.1*DF.0*TRA.true*LD.4*EAU._; TAUD=LA-1644829020144-1*RDD-1-2022_02_14*LG-3446857-2.1.F.*LD-3446858-.....; bm_sv=2A36E698463670EE6568739F8CDB1175~3+hcdvQLIRwah/ob3yiC6FDLIUMklns+OmkkhCI+VXdPQ9Cu0Tgp1gj42eltojUxM4qnZc+AQhSLEPtZgkZVPf7jtaIT9dgLdeJFsXrByFiSKAtnDYW8m7bd+9XZCOjX0Vs6okcP/XE3YBv7UlJP6aVMfxFgMK5VPFHA9GoE1IA=',
}
class CruisesSpider(CrawlSpider):
name = 'test'
start_urls = ['https://www.tripadvisor.co.uk/data/graphql/ids']
# custom_settings = {
# 'DOWNLOAD_DELAY':1
# }
def start_requests(self):
for urls in self.start_urls:
for i in range(1, 600):
yield JsonRequest(
url = urls, method = 'POST',callback = self.parse,
headers = headers,
data = [
{
'query': '013d760a68c9a4f77e9a9a903e241eb8',
'variables': {
'page': i,
'limit': 20,
'minPrice': None,
'maxPrice': None,
'order': 'popularity',
'itineraryId': None,
'vendorId': None,
'cruiseLineId': None,
'shipId': None,
'cabinType': None,
'departureDate': None,
'length': None,
'destinationId': [],
'departurePortId': None,
'portId': None,
'cruiseStyleId': None,
'dealId': None,
'viewport': 'small',
'locale': 'en_UK',
'currency': 'GBP',
},
},
],
)
def parse(self, response):
container = response.json()
for results in container:
for data_results in results['data']['cruiseList']['results']:
item = ScraperItem()
title = data_results['title']
item['title'] = [title]
return item
pipelines.py
from itemadapter import ItemAdapter
from cruise_control.models import Cruises
def clean_title(param):
return param
class ScraperPipeline:
def process_item(self, item, spider):
title = clean_title(item['title'])
Cruises.objects.create(
title=title)
return item
items.py
import scrapy
from scrapy_djangoitem import DjangoItem
from cruise_control.models import Cruises
class ScraperItem(DjangoItem):
django_model = Cruises
settings.py
import django
django.setup()
BOT_NAME = 'scraper'
SPIDER_MODULES = ['scraper.spiders']
NEWSPIDER_MODULE = 'scraper.spiders'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
'scraper.pipelines.ScraperPipeline': 100,
}
1条答案
按热度按时间eivgtgni1#
以下是对我有效的方法
我在www.example.com的
scraper
中包含了以下内容settings.py:这将删除我得到的错误。
第二,我得到了一个错误
the module cruises is not found
。所以我不得不为这个设置一个直接路径,使用:并将其保存在相同的设置中。
刮刀现在工作正常。