import pandas as pd
import os
# Step 1: Import the necessary libraries
# Steps 2-5: read both datasets, compare their row counts and abort the build
# when the data expectation is not met.


def row_count_percentage_difference(raw_row_count, clean_row_count):
    """Return the row-count drop from raw to clean as a percentage of raw.

    Args:
        raw_row_count: Number of rows in the raw dataset (must be non-zero).
        clean_row_count: Number of rows in the clean dataset.

    Returns:
        ``(raw_row_count - clean_row_count) / raw_row_count * 100``. The value
        is negative when the clean dataset has more rows than the raw one.

    Raises:
        ValueError: If ``raw_row_count`` is zero, because the percentage is
            undefined in that case.
    """
    if raw_row_count == 0:
        raise ValueError("raw_row_count must be non-zero")
    return (raw_row_count - clean_row_count) / raw_row_count * 100


def run_data_expectation_check(
    raw_path="raw_dataset.csv",
    clean_path="clean_dataset.csv",
    threshold=1.0,
):
    """Compare the row counts of two CSV datasets against a threshold.

    Args:
        raw_path: Path of the raw dataset CSV file.
        clean_path: Path of the clean dataset CSV file.
        threshold: Percentage threshold; adjust as needed.

    Returns:
        True when the check passes, False when the build should be aborted.
        A message describing the outcome is printed in both cases.
    """
    # Step 2: Read the raw dataset and clean dataset into data frames
    raw_df = pd.read_csv(raw_path)
    clean_df = pd.read_csv(clean_path)

    # Step 3: Calculate the row count for both datasets
    raw_row_count = raw_df.shape[0]
    clean_row_count = clean_df.shape[0]

    # An empty raw dataset would make the percentage a division by zero;
    # report it as a failed expectation instead of crashing with a traceback.
    if raw_row_count == 0:
        print("Aborting build: Raw dataset is empty.")
        return False

    # Step 4: Calculate the percentage difference
    percentage_difference = row_count_percentage_difference(
        raw_row_count, clean_row_count
    )

    # Step 5: Compare the percentage difference with the threshold
    # NOTE(review): the build is aborted when the difference is *below* the
    # threshold, and the message hard-codes "1%". Both are kept exactly as in
    # the original script -- confirm that this is the intended expectation.
    if percentage_difference < threshold:
        print("Aborting build: Raw dataset is less than 1% of the previous dataset.")
        return False

    print("Data expectation check passed.")
    return True


if __name__ == "__main__":
    if not run_data_expectation_check():
        # SystemExit (unlike os._exit) lets the interpreter flush stdout, so
        # the abort message above is not lost when output is piped to a log.
        raise SystemExit(1)  # Abort the build process
1条答案
按热度按时间cigdeys31#
试试这个代码:
请记住将“raw_dataset.csv”和“clean_dataset.csv”替换为原始数据集和干净数据集的实际路径。
确保您对Foundry环境中的数据集具有必要的权限和访问权限,以执行这些操作。
请注意,此代码假定数据集为CSV格式。如果数据集的格式不同,则可能需要相应地调整 `pd.read_csv()` 调用。