postgresql 如何有效地选择前一个非空值?

gupuwyp2  于 2023-03-12  发布在  PostgreSQL
关注(0)|答案(8)|浏览(299)

我在Postgres中有一个表,看起来像这样:

# select * from p;
 id | value 
----+-------
  1 |   100
  2 |      
  3 |      
  4 |      
  5 |      
  6 |      
  7 |      
  8 |   200
  9 |          
(9 rows)

我想查询一下,使它看起来像这样:

# select * from p;
 id | value | new_value
----+-------+----------
  1 |   100 |    
  2 |       |    100
  3 |       |    100
  4 |       |    100
  5 |       |    100
  6 |       |    100
  7 |       |    100
  8 |   200 |    100
  9 |       |    200
(9 rows)

我已经可以在select中使用子查询来实现这一点,但是在我的真实的数据中,我有20k或更多的行,这会变得相当慢。
这可以在窗口函数中实现吗?我喜欢使用lag(),但是它似乎不支持IGNORE NULLS选项。

select id, value, lag(value, 1) over (order by id) as new_value from p;
 id | value | new_value
----+-------+-----------
  1 |   100 |      
  2 |       |       100
  3 |       |      
  4 |       |
  5 |       |
  6 |       |
  7 |       |
  8 |   200 |
  9 |       |       200
(9 rows)
nhhxz33t

nhhxz33t1#

我发现this answer for SQL Server也可以在Postgres中工作。我以前从来没有做过,我认为这个技术相当聪明。基本上,他通过使用嵌套查询内的case语句来创建用于开窗函数的定制分区,该嵌套查询当值不为空时递增总和,否则不加处理。这允许用户使用与前一个非空值相同的数字来描述每个空部分。

SELECT
  id, value, value_partition, first_value(value) over (partition by value_partition order by id)
FROM (
  SELECT
    id,
    value,
    sum(case when value is null then 0 else 1 end) over (order by id) as value_partition

  FROM p
  ORDER BY id ASC
) as q

结果是:

id | value | value_partition | first_value
----+-------+-----------------+-------------
  1 |   100 |               1 |         100
  2 |       |               1 |         100
  3 |       |               1 |         100
  4 |       |               1 |         100
  5 |       |               1 |         100
  6 |       |               1 |         100
  7 |       |               1 |         100
  8 |   200 |               2 |         200
  9 |       |               2 |         200
(9 rows)
wwwo4jvm

wwwo4jvm2#

你可以在Postgres中创建一个自定义的聚合函数,下面是int类型的一个例子:

CREATE FUNCTION coalesce_agg_sfunc(state int, value int) RETURNS int AS
$$
    SELECT coalesce(value, state);
$$ LANGUAGE SQL;

CREATE AGGREGATE coalesce_agg(int) (
    SFUNC = coalesce_agg_sfunc,
    STYPE  = int);

然后照常查询。

SELECT *, coalesce_agg(b) over w, sum(b) over w FROM y
  WINDOW w AS (ORDER BY a);

a b coalesce_agg sum 
- - ------------ ---
a 0            0   0
b ∅            0   0
c 2            2   2
d 3            3   5
e ∅            3   5
f 5            5  10
(6 rows)
iyfjxgzm

iyfjxgzm3#

嗯,我不能保证这是最有效的方式,但工作:

SELECT id, value, (
    SELECT p2.value
    FROM p p2
    WHERE p2.value IS NOT NULL AND p2.id <= p1.id
    ORDER BY p2.id DESC
    LIMIT 1
) AS new_value
FROM p p1 ORDER BY id;

以下索引可以改进大型数据集的子查询:

CREATE INDEX idx_p_idvalue_nonnull ON p (id, value) WHERE value IS NOT NULL;

假设value是稀疏的(例如,有很多空值),它将运行良好。

nnt7mjpx

nnt7mjpx4#

另一种可能性是建立一个总和:

WITH CTE_Data(Company, ValueDate, Amount)
AS(
    SELECT 'Company', '2021-05-01', 1000    UNION
    SELECT 'Company', '2021-05-02', 1250    UNION
    SELECT 'Company', '2021-05-03', NULL    UNION
    SELECT 'Company', '2021-05-04', NULL    UNION
    SELECT 'Company', '2021-05-05', 7500    UNION
    SELECT 'Company', '2021-05-06', NULL    UNION
    SELECT 'Company', '2021-05-07', 3200    UNION
    SELECT 'Company', '2021-05-08', 3400    UNION
    SELECT 'Company', '2021-05-09', NULL    UNION
    SELECT 'Company', '2021-05-10', 7800
)

SELECT 
     d.[Company]
    ,d.[ValueDate]
    ,d.[Amount]
    ,d.[Partition]
    ,SUM(d.[Amount]) OVER(PARTITION BY d.[Company], d.[Partition]) AS [Missing]
FROM(
    SELECT
         d.[Company]
        ,d.[ValueDate]
        ,d.[Amount]
        ,SUM(CASE WHEN d.[Amount] IS NULL THEN 0 ELSE 1 END) OVER (PARTITION BY d.[Company] ORDER BY d.[ValueDate]) AS [Partition]
    FROM CTE_Data AS d 
) AS d
4ktjp1zp

4ktjp1zp5#

可以使用ARRAY_AGG模拟LAG(...)/LEAD(...) IGNORE NULLS并访问最后一个NOT NULL值:

WITH cte AS (
  SELECT *,
    ARRAY_REMOVE((ARRAY_AGG(value) OVER(/*PARTITION BY group*/ ORDER BY id)), NULL) 
       AS value_arr
  FROM tab
)
SELECT id, value,
    value_arr[array_upper(value_arr, 1)] AS new_value,
    value_arr -- debug
FROM cte
ORDER BY id;

对于输入数据:

CREATE TABLE tab(id INT, value INT);
INSERT INTO tab(id, value)
VALUES (1,100),(2,NULL),(3,NULL),(4,NULL),(5,NULL),
       (6, NULL),(7,NULL),(8,200),(9,NULL);

精简版:

SELECT *,
    (ARRAY_REMOVE((ARRAY_AGG(value) OVER( ORDER BY id)), NULL))
    [ARRAY_UPPER(ARRAY_REMOVE((ARRAY_AGG(value) OVER( ORDER BY id)), NULL), 1) ]
       AS new_value
FROM tab
ORDER BY id

代码重复看起来不太好,如果有方法使用some_array[-1]访问最后一个元素,会容易得多。

**一个

使用Lukas的建议:

SELECT *,
 to_json(ARRAY_REMOVE((ARRAY_AGG(value) OVER(/*PARTITION BY any_group*/ ORDER BY id))
         ,NULL))->-1 AS value_arr
FROM tab
ORDER BY id;
xkftehaa

xkftehaa6#

在我的情况下,我需要在非交易日维持一个运行平衡,这只是周末,偶尔在非交易假日的情况下,三天的周末
如果空闲天数非常少,可以通过CASE语句和一系列LAG窗口函数来解决此问题:

SELECT
    CASE
        WHEN balance IS NULL THEN
            -- A non-null balance must be found within the first 3 preceding rows
            CASE
                WHEN LAG(balance, 1) OVER () IS NOT NULL
                  THEN LAG(balance, 1) OVER ()
                WHEN LAG(balance, 2) OVER () IS NOT NULL
                  THEN LAG(d.balance, 2) OVER ()
                WHEN LAG(balance, 3) OVER () IS NOT NULL
                  THEN LAG(balance, 3) OVER ()
                END
        ELSE balance
    END
FROM daily_data;

对于无界问题不实用,但对于小的差距是一个很好的解决方案。如果需要的话,只需添加更多的“WHEN LAG(,x)...”子句。我很幸运,我只需要对一列进行此操作,并且此解决方案为我的目标扫清了障碍

nwlqm0z1

nwlqm0z17#

with p (id, value) as (
    values (1, 100),
           (2, null),
           (3, null),
           (4, null),
           (5, null),
           (6, null),
           (7, null),
           (8, 200),
           (9, null))
select *
     , (json_agg(value) filter (where value notnull) over (order by id) ->> -1)::int
from p
;

然后我们将使用聚合函数与过滤器选项。

vltsax25

vltsax258#

您可以将LAST_VALUE与FILTER一起使用来实现您所需要的(至少在PG 9.4中)

WITH base AS (
SELECT 1 AS id , 100 AS val
UNION ALL
SELECT 2 AS id , null AS val
UNION ALL
SELECT 3 AS id , null AS val
UNION ALL
SELECT 4 AS id , null AS val
UNION ALL
SELECT 5 AS id , 200 AS val
UNION ALL
SELECT 6 AS id , null AS val
UNION ALL
SELECT 7 AS id , null AS val
)
SELECT id, val, last(val) FILTER (WHERE val IS NOT NULL) over(ORDER BY id ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) new_val
  FROM base

相关问题