pandas 如何提高这个for循环的执行时间?

xfb7svmp  于 2023-06-28  发布在  其他
关注(0)|答案(2)|浏览(124)

我有一个名为“zone”的DF,其中xy列为整数,可以解释为点的位置。我需要计算第一个和第二个邻居的数量,我写了这个:

import numpy as np
import pandas as pd

data = np.random.randint(1000,6000,size=(600000,2))
zone = pd.DataFrame(data, columns=['x', 'y']).drop_duplicates()

a=[]
for i,row in zone.iterrows(): 
    x = row.x
    y = row.y
    num_1st_neigh = len(zone[(zone.x>=(x-1))&(zone.x<=(x+1))&(zone.y>=(y-1))&(zone.y<=(y+1))])-1
    num_2nd_neigh = (len(zone[(zone.x>=(x-2))&(zone.x<=(x+2))&(zone.y>=(y-2))&(zone.y<=(y+2))])-1)\
    -(num_1st_neigh)
    a.append([i,num_1st_neigh,num_2nd_neigh])
a = pd.DataFrame(a, columns = ['index','num_1st_neigh','num_2nd_neigh'])
zzz = zone.reset_index().merge(a,on='index')

这工作很好,但持续15秒的3 K点,我有1 M点,它的stll运行后2小时。有什么想法可以提高执行速度吗?
我读到过iterrows非常慢,但我不知道我还能怎么做。
编辑:我也用SQL做了同样的尝试,但执行时间> 2 h,查询返回一个超时:

SELECT t0.x,
    t0.y,
    count_if(greatest(abs(t0.x-t1.x), abs(t0.y-t1.y)) = 1) num_1_neighbors,
    count_if(greatest(abs(t0.x-t1.x), abs(t0.y-t1.y)) = 2) num_2_neighbors
FROM "table" t0 
    left join "table" t1 on t1.x between t0.x -2 and t0.x + 2
    and t1.y between t0.y -2 and t0.y + 2
    and (
        t1.x <> t0.x
        or t1.y <> t0.y
    )
group by 1,2

任何使用SQL或pandas的想法都非常受欢迎

r7knjye2

r7knjye21#

您可以从sklearn使用BallTree

from sklearn.neighbors import BallTree

xy = zone[['x', 'y']]
tree = BallTree(xy, metric='euclidean')
num_1st_neigh = tree.query_radius(xy, r=1*np.sqrt(2), count_only=True) - 1
num_2nd_neigh = tree.query_radius(xy, r=2*np.sqrt(2), count_only=True) - 1 - num_1st_neigh
zone['num_1st_neigh'] = num_1st_neigh
zone['num_2nd_neigh'] = num_2nd_neigh

举个小例子:

# BallTree
>>> zone
     x    y  num_1st_neigh  num_2nd_neigh
0  106  115              0              0
1  118  104              0              0
2  119  114              0              0
3  108  103              0              2
4  103  101              0              0
5  110  105              0              1
6  103  104              0              0
7  102  119              0              0
8  106  105              0              1
9  111  114              0              0

# Your code
>>> zzz
   index    x    y  num_1st_neigh  num_2nd_neigh
0      0  106  115              0              0
1      1  118  104              0              0
2      2  119  114              0              0
3      3  108  103              0              2
4      4  103  101              0              0
5      5  110  105              0              1
6      6  103  104              0              0
7      7  102  119              0              0
8      8  106  105              0              1
9      9  111  114              0              0
j2cgzkjk

j2cgzkjk2#

  • “快速和肮脏”* 的解决方案可以使用经典的2D空间散列Map(实现非常简单):
class SpatialHashMap:
    def __init__(self, power_of_two_bin_size):
        self.shift = power_of_two_bin_size
        self.bins = {}

    def _get_hash(self, x, y):
        return x >> self.shift, y >> self.shift

    def insert(self, point):
        key = self._get_hash(*point)
        if key not in self.bins:
            self.bins[key] = []
        self.bins[key].append(point)

    def get_local_points(self, bounding_box):
        min_x, min_y, max_x, max_y = bounding_box
        min_hash_x, min_hash_y = self._get_hash(min_x, min_y)
        max_hash_x, max_hash_y = self._get_hash(max_x, max_y)

        local_points = []
        for hash_x in range(min_hash_x, max_hash_x + 1):
            for hash_y in range(min_hash_y, max_hash_y + 1):
                if (hash_x, hash_y) in self.bins:
                    local_points.extend(self.bins[(hash_x, hash_y)])

        return local_points

    def restart(self):
        self.bins = {}

    def generate(self, points):
        for point in points:
            self.insert(point)

然后:

np.random.seed(42)

data = np.random.randint(1000,1100,size=(3_000,2))
zone = pd.DataFrame(data, columns=['x', 'y']).drop_duplicates()

m = SpatialHashMap(3)
for point in zip(zone.x, zone.y):
    m.insert(point)

a = []
for i,row in zone.iterrows():
    x, y = row.x, row.y

    bounding_box = x - 2, y - 2, x + 2, y + 2
    points = m.get_local_points(bounding_box)
    num_1st_neigh = sum(max(abs(x - xx), abs(y - yy)) == 1 for xx, yy in points)
    num_2nd_neigh = sum(max(abs(x - xx), abs(y - yy)) == 2 for xx, yy in points)
    a.append([i,num_1st_neigh,num_2nd_neigh])

a = pd.DataFrame(a, columns = ['index','num_1st_neigh','num_2nd_neigh'])
zzz = zone.reset_index().merge(a,on='index')
print(zzz.head())

这将打印:

index     x     y  num_1st_neigh  num_2nd_neigh
0      0  1051  1092              0              5
1      1  1014  1071              4              4
2      2  1060  1020              4              9
3      3  1082  1086              1              6
4      4  1074  1074              1              2

基准:

import numpy as np
import pandas as pd
from timeit import timeit

class SpatialHashMap:
    def __init__(self, power_of_two_bin_size):
        self.shift = power_of_two_bin_size
        self.bins = {}

    def _get_hash(self, x, y):
        return x >> self.shift, y >> self.shift

    def insert(self, point):
        key = self._get_hash(*point)
        if key not in self.bins:
            self.bins[key] = []
        self.bins[key].append(point)

    def get_local_points(self, bounding_box):
        min_x, min_y, max_x, max_y = bounding_box
        min_hash_x, min_hash_y = self._get_hash(min_x, min_y)
        max_hash_x, max_hash_y = self._get_hash(max_x, max_y)

        local_points = []
        for hash_x in range(min_hash_x, max_hash_x + 1):
            for hash_y in range(min_hash_y, max_hash_y + 1):
                if (hash_x, hash_y) in self.bins:
                    local_points.extend(self.bins[(hash_x, hash_y)])

        return local_points

    def restart(self):
        self.bins = {}

    def generate(self, points):
        for point in points:
            self.insert(point)

np.random.seed(42)

data = np.random.randint(1000,1100,size=(3_000,2))
zone = pd.DataFrame(data, columns=['x', 'y']).drop_duplicates()

def fn1(zone):
    a=[]
    for i,row in zone.iterrows():
        x, y = row.x, row.y
        num_1st_neigh = len(zone[(zone.x>=(x-1))&(zone.x<=(x+1))&(zone.y>=(y-1))&(zone.y<=(y+1))])-1
        num_2nd_neigh = (len(zone[(zone.x>=(x-2))&(zone.x<=(x+2))&(zone.y>=(y-2))&(zone.y<=(y+2))])-1)\
        -(num_1st_neigh)
        a.append([i,num_1st_neigh,num_2nd_neigh])

    a = pd.DataFrame(a, columns = ['index','num_1st_neigh','num_2nd_neigh'])
    zzz = zone.reset_index().merge(a,on='index')
    return zzz

def fn2(zone):
    m = SpatialHashMap(3)
    for point in zip(zone.x, zone.y):
        m.insert(point)

    a = []
    for i,row in zone.iterrows():
        x, y = row.x, row.y

        bounding_box = x - 2, y - 2, x + 2, y + 2
        points = m.get_local_points(bounding_box)
        num_1st_neigh = sum(max(abs(x - xx), abs(y - yy)) == 1 for xx, yy in points)
        num_2nd_neigh = sum(max(abs(x - xx), abs(y - yy)) == 2 for xx, yy in points)
        a.append([i,num_1st_neigh,num_2nd_neigh])

    a = pd.DataFrame(a, columns = ['index','num_1st_neigh','num_2nd_neigh'])
    zzz = zone.reset_index().merge(a,on='index')

    return zzz

t1 = timeit('fn1(x)', setup='x=zone.copy()', number=1, globals=globals())
t2 = timeit('fn2(x)', setup='x=zone.copy()', number=1, globals=globals())

print(t1)
print(t2)

图纸:

1.9649752220138907
0.15029849001439288

对于data = np.random.randint(1000,6000,size=(1_000_000,2)),空间哈希Map版本在44.175860428018495秒内返回(在我的AMD Ryzen 5700 X/Python 3.10机器上)

相关问题