SQL table join with weights

Asked by 0yg35tkg on 2021-07-26

I am trying to create a table that weights a customer's spend by the months within a period in which they shopped. For example, the following customer (fake data) has this spend profile:

/* Customer spend */
create or replace temp table ts_all_transactions
(
     inferred_customer_id varchar(128)
    ,nw_date date
    ,spend number(21,2)
);

insert into ts_all_transactions
    values
     ('52f5','2019-06-01',17.35)
    ,('52f5','2018-11-01',24.85)
    ,('52f5','2019-12-01',1.40)
    ,('52f5','2019-01-01',2.45)
    ,('52f5','2019-03-01',3.90)
    ,('52f5','2020-01-01',37.55)
    ,('52f5','2019-10-01',13.20)
    ,('52f5','2019-09-01',5.70)
;

I then create a calendar holding the months that belong to each period, along with a weight for each month:

-- Calculate weights for each period of the time series
-- Create a staging table
create or replace temp table period_dimension as
    select abs(seq4()-12) as period,
           dateadd(month, seq4(), dateadd(month, -23, date_trunc('Month', current_date()))) as start_date,
           dateadd(month, 12, start_date) as end_date
    from table(generator(rowcount => 12))  -- number of months after reference date in previous line
;

select * from period_dimension;

create or replace temp table my_date_dimension
(
     my_date          date        not null
    ,year             smallint    not null
    ,month            smallint    not null
    ,month_name       char(3)     not null
    ,day_of_mon       smallint    not null
    ,day_of_week      varchar(9)  not null
    ,week_of_year     smallint    not null
    ,day_of_year      smallint    not null
)
as
  with my_date as (
    select
           seq4(),
           dateadd(month, seq4(), dateadd(month, -23, date_trunc('Month', current_date()))) as my_date
      from table(generator(rowcount=>23))
  )
  select  my_date
        ,year(my_date)
        ,month(my_date)
        ,monthname(my_date)
        ,day(my_date)
        ,dayofweek(my_date)
        ,weekofyear(my_date)
        ,dayofyear(my_date)
    from my_date
;

create or replace table weight_lookup as
    select
         a.period
        ,b.my_date
        ,rank() over (partition by a.period order by b.my_date) as weight
    from period_dimension a
    inner join my_date_dimension b
    where b.my_date >= a.start_date
    and b.my_date < a.end_date
    order by 1,2
;

-- Create a staging table
create or replace temp table period_dimension2 as
    select abs(seq4()-12) as period,
           dateadd(month, seq4(), dateadd(month, -23, date_trunc('Month', current_date()))) as start_date,
           last_day(dateadd(month, 11, start_date)) as end_date
      from table(generator(rowcount => 12))  -- number of months after reference date in previous line
;

Then I use the above to calculate an average spend based on the months in which the customer shopped during the period; however, I am not getting the results I expect:

-- For each month of each period, group all together by period here so we have 12 periods
-- so each period represents 12 rolling months with period 12 being the oldest period
create or replace temp table ts_spend_time as
    select
         a.inferred_customer_id
        ,b.period
        ,max(a.nw_date) as max_mnth /* Month in period where most spend was made */
        ,sum(a.spend * b.weight) / 78 as avg_spend /* Sum of weights 12,11,10...1 to give 78 */

    from ts_all_transactions a
    inner join weight_lookup b on a.nw_date = b.my_date
    inner join period_dimension2 c on b.my_date = c.start_date and b.period = c.period

    where b.my_date >= c.start_date
    and b.my_date <= c.end_date

    group by 1,2
    order by 1 desc, 2,3
;

The output I get from the code above is as follows:

create or replace temp table ts_spend_time_wrong_out
(
     inferred_customer_id varchar(128)
    ,period number(11)
    ,max_mnth date
    ,avg_spend number(38,8)
);

insert into ts_spend_time_wrong_out
  values
 ('52f5',3,'2019-03-01',0.05000000)
,('52f5',5,'2019-01-01',0.03141026)
,('52f5',7,'2018-11-01',0.31858974)
;

The output I would like to get is this:

create or replace temp table ts_spend_time_should_be
(
     inferred_customer_id varchar(128)
    ,period number(11)
    ,max_mnth date
    ,avg_spend number(38,8)
);

insert into ts_spend_time_should_be
  values
     ('52f5',1,'01JAN2020',6.301923077)
    ,('52f5',2,'01JAN2020',7.266025641)
    ,('52f5',3,'01JAN2020',8.280128205)
    ,('52f5',4,'01JAN2020',9.294230769)
    ,('52f5',5,'01DEC2019',4.081410256)
    ,('52f5',6,'01OCT2019',4.412179487)
    ,('52f5',7,'01OCT2019',5.276923077)
    ,('52f5',8,'01SEP2019',3.941666667)
    ,('52f5',9,'01JUN2019',3.687179487)
    ,('52f5',10,'01JUN2019',4.309615385)
    ,('52f5',11,'01JUN2019',4.932051282)
    ,('52f5',12,'01MAR2019',2.662820513)
;

In the expected output, the average spend for a period is calculated like this (shown here for period 1): ((17.35*2) + (5.7*5) + (13.2*6) + (1.4*8) + (37.55*9)) / 78
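
That figure can be verified with a quick ad-hoc query (the weights 2, 5, 6, 8, 9 paired with the matching monthly spends, divided by the weight total 78 = 12+11+...+1):

-- sanity check of the expected period-1 figure
select ((17.35*2) + (5.7*5) + (13.2*6) + (1.4*8) + (37.55*9)) / 78 as avg_spend_check;
-- 6.301923...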
How do I fix this? Thanks in advance.

cwdobuhd1#

First of all, you should use row_number() over (order by seq4()) rather than seq4() on its own, because seq4() can contain gaps; that alone solves half of your problem.
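
As a minimal sketch of the difference (just an ad-hoc illustration, not part of the fix itself):

-- seq4() only guarantees increasing values and may skip numbers;
-- row_number() over the same ordering is always gap-free
select seq4()                                  as raw_seq
      ,row_number() over (order by seq4()) - 1 as gapless_seq
from table(generator(rowcount => 12));

Applying that pattern to your whole setup: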

with ts_all_transactions as (
  select id, nw_date::date as nw_date, spend from values
     ('52f5','2019-06-01',17.35)
    ,('52f5','2018-11-01',24.85)
    ,('52f5','2019-12-01',1.40)
    ,('52f5','2019-01-01',2.45)
    ,('52f5','2019-03-01',3.90)
    ,('52f5','2020-01-01',37.55)
    ,('52f5','2019-10-01',13.20)
    ,('52f5','2019-09-01',5.70)
   v(id,nw_date, spend)
), period_dimension as (
    select 
        row_number() over(order by seq4())-1 as rn0,
        abs(rn0-12) as period,
           dateadd('month', rn0, dateadd(month, -23, date_trunc('Month', current_date()))) as start_date,
           dateadd('month', 12, start_date) as end_date
    from table(generator(rowcount => 12))  -- number of months after reference date in previous line
), weight_periods as (
    select p.period
        ,p.start_date
        ,p.end_date
        ,row_number() over(partition by p.period order by seq4())-1 as rn1
        ,dateadd('month',-rn1, p.end_date ) as weight_month
        ,12 - rn1 + 1 as weight
    from period_dimension p,
        table(generator(rowcount => 12))
), monthly_spends as (
    select id
        ,date_trunc('month', nw_date) as m_date
        ,sum(spend) as s_spend
    from ts_all_transactions
    group by 1,2
)
select m.id
    ,w.period
    ,w.end_date
    ,w.weight_month
    ,m.s_spend
    ,w.weight
    ,m.s_spend * w.weight as w_spend
  from monthly_spends m
  join weight_periods w on m.m_date = w.weight_month 
  order by 1,2,3,4;

Which gives:

ID    PERIOD  END_DATE    WEIGHT_MONTH  S_SPEND  WEIGHT  W_SPEND
52f5  1       2020-05-01  2019-06-01    17.35    2        34.70
52f5  1       2020-05-01  2019-09-01     5.70    5        28.50
52f5  1       2020-05-01  2019-10-01    13.20    6        79.20
52f5  1       2020-05-01  2019-12-01     1.40    8        11.20
52f5  1       2020-05-01  2020-01-01    37.55    9       337.95
52f5  2       2020-04-01  2019-06-01    17.35    3        52.05
...

So far so good: we can see the inputs to the "weighted average" values, which can be computed like so:

select m.id
    ,w.period
    ,sum(m.s_spend * w.weight) as t_w_spend
    ,round(t_w_spend / 78,3) as weighted_avg_spend 
  from monthly_spends m
  join weight_periods w on m.m_date = w.weight_month 
  group by 1,2
  order by 1,2;

Which gives:

ID  PERIOD  T_W_SPEND   WEIGHTED_AVG_SPEND
52f5    1   491.55  6.302
52f5    2   566.75  7.266
52f5    3   641.95  8.230
52f5    4   724.95  9.294
52f5    5   804.05  10.308
52f5    6   362.35  4.646
52f5    7   386.75  4.958
52f5    8   479.05  6.142
52f5    9   361.70  4.637
52f5    10  336.15  4.310
52f5    11  384.70  4.932
52f5    12  433.25  5.554

These start out the same as your expected values, but then diverge, because I think your date periods are "wrong".
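
One way to sanity-check those boundaries (assuming the two staging tables from your question are still around) is to line the two period definitions up side by side:

-- compare the question's two end-date conventions per period
select a.period
      ,a.start_date
      ,a.end_date as end_date_exclusive   -- start_date + 12 months
      ,b.end_date as end_date_last_day    -- last_day(start_date + 11 months)
from period_dimension a
join period_dimension2 b on a.period = b.period
order by a.period;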
The next point is that you have this line:

,max(a.nw_date) as max_mnth /* Month in period where most spend was made */

but it does not do what the comment says it does; what it actually does is return the latest date within the aggregation.
To get the month where the most spend was made, you need to go back to the monthly-results SQL, bring first_value() into the mix, and then select from that result like so:

select id, period, max_spend_month, sum(w_spend)/78 as weighted_avg_spend
  from (
      select m.id
      ,w.period
      ,w.end_date
      ,w.weight_month
      ,m.s_spend
      ,w.weight
      ,m.s_spend * w.weight as w_spend
      ,first_value(w.weight_month) over (partition by m.id, w.period order by m.s_spend desc) as max_spend_month
    from monthly_spends m
    join weight_periods w on m.m_date = w.weight_month 
  )
  group by 1,2,3
  order by 1,2;

Which now lines up with what you were expecting:

ID  PERIOD  MAX_SPEND_MONTH WEIGHTED_AVG_SPEND
52f5    1   2020-01-01  6.30192308
52f5    2   2020-01-01  7.26602564
52f5    3   2020-01-01  8.23012821
52f5    4   2020-01-01  9.29423077
52f5    5   2020-01-01  10.30833333
52f5    6   2019-06-01  4.64551282
52f5    7   2019-06-01  4.95833333
52f5    8   2018-11-01  6.14166667
52f5    9   2018-11-01  4.63717949
52f5    10  2018-11-01  4.30961538
52f5    11  2018-11-01  4.93205128
52f5    12  2018-11-01  5.55448718
