import dask.dataframe as dd
def multi_str_contains(dataframe, sub_col, super_col, new_col):
    """Flag, per row, whether the ``super_col`` text contains the ``sub_col`` value.

    The frame is split into one section per distinct value of ``sub_col``.
    Each section is tested with a literal (non-regex) substring search of
    that value against ``super_col``, and the sections are concatenated
    back together with ``dd.concat``.

    Args:
        dataframe: Frame holding both columns. It is not modified; every
            section is built from a copy of the filtered rows.
        sub_col: Name of the column holding the substring to look for.
        super_col: Name of the string column that is searched.
        new_col: Name of the boolean result column added to the output.

    Returns:
        The concatenated sections, each carrying ``new_col``, or ``None``
        when ``sub_col`` has no distinct values (e.g. an empty frame).
        Rows come back grouped by ``sub_col`` value, in the order
        ``unique()`` yields those values, not in their original order.
    """
    sub_col_unique = dataframe[sub_col].unique()
    # Explicit length check: the result of unique() is array-like, so plain
    # truthiness (`if not sub_col_unique`) would be ambiguous.
    if len(sub_col_unique) == 0:
        return None

    sections = []
    for sub in sub_col_unique:
        # Copy so that adding the result column never writes into `dataframe`.
        section = dataframe[dataframe[sub_col] == sub].copy()
        # regex=False: `sub` is matched literally, so characters such as "."
        # or "(" in the data are not interpreted as a pattern.
        section[new_col] = section[super_col].str.contains(sub, regex=False)
        sections.append(section)

    # `sections` is never empty here: there is at least one distinct value
    # and every iteration appends exactly one frame.
    return dd.concat(sections)
1条答案
按热度按时间qc6wkl3g1#
方法1:对每个唯一的子字符串执行操作
这种方法更适合在 Pandas 中使用,因为在 Dask 中它会产生非常多的任务。