pandas 如何提高这个for循环的执行时间？

xfb7svmp 于 2023-06-28 发布在其他

关注(0)|答案(2)|浏览(124)

我有一个名为“zone”的DF，其中x和y列为整数，可以解释为点的位置。我需要计算第一个和第二个邻居的数量，我写了这个：

import numpy as np
import pandas as pd

data = np.random.randint(1000,6000,size=(600000,2))
zone = pd.DataFrame(data, columns=['x', 'y']).drop_duplicates()

a=[]
for i,row in zone.iterrows(): 
    x = row.x
    y = row.y
    num_1st_neigh = len(zone[(zone.x>=(x-1))&(zone.x<=(x+1))&(zone.y>=(y-1))&(zone.y<=(y+1))])-1
    num_2nd_neigh = (len(zone[(zone.x>=(x-2))&(zone.x<=(x+2))&(zone.y>=(y-2))&(zone.y<=(y+2))])-1)\
    -(num_1st_neigh)
    a.append([i,num_1st_neigh,num_2nd_neigh])
a = pd.DataFrame(a, columns = ['index','num_1st_neigh','num_2nd_neigh'])
zzz = zone.reset_index().merge(a,on='index')

这工作很好，但持续15秒的3 K点，我有1 M点，它的stll运行后2小时。有什么想法可以提高执行速度吗？
我读到过iterrows非常慢，但我不知道我还能怎么做。
编辑：我也用SQL做了同样的尝试，但执行时间> 2 h，查询返回一个超时：

SELECT t0.x,
    t0.y,
    count_if(greatest(abs(t0.x-t1.x), abs(t0.y-t1.y)) = 1) num_1_neighbors,
    count_if(greatest(abs(t0.x-t1.x), abs(t0.y-t1.y)) = 2) num_2_neighbors
FROM "table" t0 
    left join "table" t1 on t1.x between t0.x -2 and t0.x + 2
    and t1.y between t0.y -2 and t0.y + 2
    and (
        t1.x <> t0.x
        or t1.y <> t0.y
    )
group by 1,2

任何使用SQL或pandas的想法都非常受欢迎

pandas

来源：https://stackoverflow.com/questions/76558860/how-to-improve-the-time-of-execution-of-this-for-loop

2条答案

按热度按时间

r7knjye21#

您可以从sklearn使用BallTree：

from sklearn.neighbors import BallTree

xy = zone[['x', 'y']]
tree = BallTree(xy, metric='euclidean')
num_1st_neigh = tree.query_radius(xy, r=1*np.sqrt(2), count_only=True) - 1
num_2nd_neigh = tree.query_radius(xy, r=2*np.sqrt(2), count_only=True) - 1 - num_1st_neigh
zone['num_1st_neigh'] = num_1st_neigh
zone['num_2nd_neigh'] = num_2nd_neigh

举个小例子：

# BallTree
>>> zone
     x    y  num_1st_neigh  num_2nd_neigh
0  106  115              0              0
1  118  104              0              0
2  119  114              0              0
3  108  103              0              2
4  103  101              0              0
5  110  105              0              1
6  103  104              0              0
7  102  119              0              0
8  106  105              0              1
9  111  114              0              0

# Your code
>>> zzz
   index    x    y  num_1st_neigh  num_2nd_neigh
0      0  106  115              0              0
1      1  118  104              0              0
2      2  119  114              0              0
3      3  108  103              0              2
4      4  103  101              0              0
5      5  110  105              0              1
6      6  103  104              0              0
7      7  102  119              0              0
8      8  106  105              0              1
9      9  111  114              0              0

赞(0）回复(0）举报 2023-06-28

j2cgzkjk2#

“快速和肮脏”* 的解决方案可以使用经典的2D空间散列Map（实现非常简单）：

class SpatialHashMap:
    def __init__(self, power_of_two_bin_size):
        self.shift = power_of_two_bin_size
        self.bins = {}

    def _get_hash(self, x, y):
        return x >> self.shift, y >> self.shift

    def insert(self, point):
        key = self._get_hash(*point)
        if key not in self.bins:
            self.bins[key] = []
        self.bins[key].append(point)

    def get_local_points(self, bounding_box):
        min_x, min_y, max_x, max_y = bounding_box
        min_hash_x, min_hash_y = self._get_hash(min_x, min_y)
        max_hash_x, max_hash_y = self._get_hash(max_x, max_y)

        local_points = []
        for hash_x in range(min_hash_x, max_hash_x + 1):
            for hash_y in range(min_hash_y, max_hash_y + 1):
                if (hash_x, hash_y) in self.bins:
                    local_points.extend(self.bins[(hash_x, hash_y)])

        return local_points

    def restart(self):
        self.bins = {}

    def generate(self, points):
        for point in points:
            self.insert(point)

然后：

np.random.seed(42)

data = np.random.randint(1000,1100,size=(3_000,2))
zone = pd.DataFrame(data, columns=['x', 'y']).drop_duplicates()

m = SpatialHashMap(3)
for point in zip(zone.x, zone.y):
    m.insert(point)

a = []
for i,row in zone.iterrows():
    x, y = row.x, row.y

    bounding_box = x - 2, y - 2, x + 2, y + 2
    points = m.get_local_points(bounding_box)
    num_1st_neigh = sum(max(abs(x - xx), abs(y - yy)) == 1 for xx, yy in points)
    num_2nd_neigh = sum(max(abs(x - xx), abs(y - yy)) == 2 for xx, yy in points)
    a.append([i,num_1st_neigh,num_2nd_neigh])

a = pd.DataFrame(a, columns = ['index','num_1st_neigh','num_2nd_neigh'])
zzz = zone.reset_index().merge(a,on='index')
print(zzz.head())

这将打印：

index     x     y  num_1st_neigh  num_2nd_neigh
0      0  1051  1092              0              5
1      1  1014  1071              4              4
2      2  1060  1020              4              9
3      3  1082  1086              1              6
4      4  1074  1074              1              2

基准：

import numpy as np
import pandas as pd
from timeit import timeit

class SpatialHashMap:
    def __init__(self, power_of_two_bin_size):
        self.shift = power_of_two_bin_size
        self.bins = {}

    def _get_hash(self, x, y):
        return x >> self.shift, y >> self.shift

    def insert(self, point):
        key = self._get_hash(*point)
        if key not in self.bins:
            self.bins[key] = []
        self.bins[key].append(point)

    def get_local_points(self, bounding_box):
        min_x, min_y, max_x, max_y = bounding_box
        min_hash_x, min_hash_y = self._get_hash(min_x, min_y)
        max_hash_x, max_hash_y = self._get_hash(max_x, max_y)

        local_points = []
        for hash_x in range(min_hash_x, max_hash_x + 1):
            for hash_y in range(min_hash_y, max_hash_y + 1):
                if (hash_x, hash_y) in self.bins:
                    local_points.extend(self.bins[(hash_x, hash_y)])

        return local_points

    def restart(self):
        self.bins = {}

    def generate(self, points):
        for point in points:
            self.insert(point)

np.random.seed(42)

data = np.random.randint(1000,1100,size=(3_000,2))
zone = pd.DataFrame(data, columns=['x', 'y']).drop_duplicates()

def fn1(zone):
    a=[]
    for i,row in zone.iterrows():
        x, y = row.x, row.y
        num_1st_neigh = len(zone[(zone.x>=(x-1))&(zone.x<=(x+1))&(zone.y>=(y-1))&(zone.y<=(y+1))])-1
        num_2nd_neigh = (len(zone[(zone.x>=(x-2))&(zone.x<=(x+2))&(zone.y>=(y-2))&(zone.y<=(y+2))])-1)\
        -(num_1st_neigh)
        a.append([i,num_1st_neigh,num_2nd_neigh])

    a = pd.DataFrame(a, columns = ['index','num_1st_neigh','num_2nd_neigh'])
    zzz = zone.reset_index().merge(a,on='index')
    return zzz

def fn2(zone):
    m = SpatialHashMap(3)
    for point in zip(zone.x, zone.y):
        m.insert(point)

    a = []
    for i,row in zone.iterrows():
        x, y = row.x, row.y

        bounding_box = x - 2, y - 2, x + 2, y + 2
        points = m.get_local_points(bounding_box)
        num_1st_neigh = sum(max(abs(x - xx), abs(y - yy)) == 1 for xx, yy in points)
        num_2nd_neigh = sum(max(abs(x - xx), abs(y - yy)) == 2 for xx, yy in points)
        a.append([i,num_1st_neigh,num_2nd_neigh])

    a = pd.DataFrame(a, columns = ['index','num_1st_neigh','num_2nd_neigh'])
    zzz = zone.reset_index().merge(a,on='index')

    return zzz

t1 = timeit('fn1(x)', setup='x=zone.copy()', number=1, globals=globals())
t2 = timeit('fn2(x)', setup='x=zone.copy()', number=1, globals=globals())

print(t1)
print(t2)

图纸：

1.9649752220138907
0.15029849001439288

对于data = np.random.randint(1000,6000,size=(1_000_000,2))，空间哈希Map版本在44.175860428018495秒内返回（在我的AMD Ryzen 5700 X/Python 3.10机器上）

赞(0）回复(0）举报 2023-06-28

我来回答

pandas 如何提高这个for循环的执行时间？

2条答案

相关问题

热门标签

最新问答