我想从网站上抓取文本,并根据我的需要对其进行分类。想用 Python 和谷歌的 AI/ML 服务来实现。
我从零开始尝试:
import requests
from bs4 import BeautifulSoup
def scrape_website(url):
    """Fetch *url* and return its parsed HTML, or None on any failure.

    Args:
        url: Address of the page to download.

    Returns:
        A BeautifulSoup document tree on success, or None when the
        request fails (network error, timeout) or the server answers
        with a non-200 status code.
    """
    try:
        # requests.get raises (DNS failure, refused connection, timeout)
        # rather than returning a response object — guard the call so the
        # caller always gets the documented soup-or-None contract.
        # Without an explicit timeout the call can hang indefinitely.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Error: Unable to fetch the URL. {exc}")
        return None
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    print(f"Error: Unable to fetch the URL. Status code: {response.status_code}")
    return None
def extract_information(soup, query):
    """Answer a simple free-text query against a parsed page.

    Args:
        soup: Parsed document exposing ``.title.text`` (BeautifulSoup tree).
        query: The user's question; matching is case-insensitive.

    Returns:
        A formatted answer string, or ``"Query not supported."`` for any
        question other than the page's project name.
    """
    # Guard clause: only the "project name" query is implemented here;
    # extend this dispatch with more parsing logic as needed.
    if query.lower() != "project name":
        return "Query not supported."
    project_name = soup.title.text.strip()
    return f"Project Name: {project_name}"
if __name__ == "__main__":
    # Ask for the target page once; the parsed tree is reused for every query.
    target_url = input("Enter the URL: ")
    page = scrape_website(target_url)
    if page:
        # Interactive Q&A loop: keep answering until the user types "exit".
        while True:
            question = input("Enter your question (e.g., 'Project Name', 'Status'): ")
            if question.lower() == "exit":
                break
            print(extract_information(page, question))
上面的代码给了我下面的输出,但它没有达到我的期望:
输入URL:https://h2v.eu/hydrogen-valleys/crystal-brook-hydrogen-superhub
输入您的问题(例如,“项目名称”,“状态”):项目名称
项目名称:氢谷|水晶溪氢超级枢纽
输入您的问题(例如,“项目名称”,“状态”):状态
不支持查询。
我也试过:
import tkinter as tk
from tkinter import ttk
from bs4 import BeautifulSoup
from google.cloud import language_v1
import requests
def scrape_and_analyze(url):
    """Scrape *url* and return PERSON entity names found by Google Cloud NL.

    Args:
        url: Address of the page to download and analyze.

    Returns:
        A list of strings. On success: the names of PERSON entities.
        On failure: a one-element list with an error message. (The old
        version returned a bare ``str`` on error, which made the caller's
        ``"\\n".join(result)`` interleave a newline between every
        character of the message — always returning a list fixes that.)
    """
    # --- Web scraping ---
    try:
        response = requests.get(url, timeout=10)
        # Treat HTTP 4xx/5xx as failures instead of analyzing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        text_content = soup.get_text()
    except Exception as e:
        return [f"Error in web scraping: {str(e)}"]
    # --- Google Cloud Natural Language API ---
    try:
        client = language_v1.LanguageServiceClient()
        document = language_v1.Document(
            content=text_content,
            type_=language_v1.Document.Type.PLAIN_TEXT,
        )
        annotations = client.analyze_entities(document=document)
        entities = annotations.entities
    except Exception as e:
        return [f"Error in text analysis: {str(e)}"]
    # Filter entities of interest (customize this based on your needs).
    return [entity.name for entity in entities
            if entity.type_ == language_v1.Entity.Type.PERSON]
def on_submit():
    """Handle the Submit button: scrape the entered URL and show results.

    Reads the URL from the module-level ``url_entry`` widget and writes
    the outcome into the ``result_text`` widget.
    """
    url = url_entry.get()
    result = scrape_and_analyze(url)
    result_text.delete(1.0, tk.END)
    # scrape_and_analyze returns a plain string on its error paths;
    # "\n".join of a string would insert a newline between every single
    # character, so only join when we actually received a list of names.
    if isinstance(result, str):
        result_text.insert(tk.END, result)
    else:
        result_text.insert(tk.END, "\n".join(result))
# UI Setup — builds the window top-to-bottom; widget creation order
# determines the pack() stacking order, so don't reorder these blocks.
root = tk.Tk()
root.title("Web Scraping and Text Analysis")
# URL Entry — read by on_submit via url_entry.get()
url_label = ttk.Label(root, text="Enter URL:")
url_label.pack(pady=10)
url_entry = ttk.Entry(root, width=50)
url_entry.pack(pady=10)
# Submit Button — triggers the scrape-and-analyze pipeline
submit_button = ttk.Button(root, text="Submit", command=on_submit)
submit_button.pack(pady=10)
# Result Text — on_submit clears and refills this widget with results
result_text = tk.Text(root, height=10, width=50, wrap="word")
result_text.pack(pady=10)
# END — mainloop() blocks here until the window is closed
root.mainloop()
但运行时同样会出错。
1 条答案
回答者:anauzrmj
网络抓取涉及从网站中提取数据,它是一个有用的工具。在抓取之前,请务必检查网站的
robots.txt
文件和服务条款。下面是一个简单的例子,使用Python和
BeautifulSoup
库进行HTML解析,requests
库进行HTTP请求。请确保首先安装这些库:字符串
现在,您可以使用以下示例作为Web抓取的起点:
(示例代码见原帖。)
记住:在抓取之前,务必检查网站的 robots.txt 和服务条款。