hydradatabase/hydra

LEFT JOIN has poor/inconsistent performance

muntdan opened this issue · 5 comments

What's wrong?

I have 2 CTE and than a UNION ALL.
WITH CTE1 as (SELECT * from Table1 Left Join Table2 where targetid in (1,2,3)), CTE2 as (SELECT col1 from CTE1, left join lateral (select col2 from table2 where targetid =cte1.targetid) on true where point is null ) --this one has 0 ROWS SELECT * FROM CTE1 WHERE point IS NOT NULL UNION ALL SELECT * FROM CTE2

all selects are < 1 second and without UNION ALL separately they work ok.
When UNION ALL is applied a CTE scan operation makes a Index scan on Table2 and query takes > 6 seconds

Re-categorizing as a possible performance enhancement, though this sounds like a planner issue? I think you could force a table scan using pg_hint_plan, so perhaps there is a workaround for you.

Please let us know if the query results were inaccurate, in which case it's certainly a bug we want to investigate.

Strange enough this now takes 6 seconds no matter with or without the union so it might be from the join inside the first CTE which separately take ~1 sec but when joined they take 6 seconds

Join performance is highly dependent on a lot of factors; there's nothing specific enough here for us to look into. If you can provide an example/test case we could investigate further.

Here are the scripts to reproduce:

CREATE TABLE ten (id int, tenant text, targetid int);
INSERT INTO ten VALUES (1, 'a', 101),(2, 'a', 102),(3, 'b', 201),(4, 'b', 202),
(5, 'c', 301),(6, 'c', 302),(7, 'd', 401),(8, 'd', 402),(9, 'e', 501),(10, 'e', 502);

CREATE TABLE tp (id int, targetid int, tenant text, datetime timestamp without time zone, speed int,
				CONSTRAINT tp_pkey PRIMARY KEY (id, tenant, datetime) ) PARTITION BY LIST (tenant);
DO $$
DECLARE t text; d timestamp;
BEGIN 
FOR t IN SELECT distinct tenant FROM ten LOOP
	    raise notice 'tp_ten%', t;
        EXECUTE 'CREATE TABLE '|| 'tp_ten' || t || ' PARTITION OF tp FOR VALUES IN  ('''|| t ||''') PARTITION BY RANGE(datetime);';
		FOR d IN SELECT * FROM  generate_series('2023-12-01 00:00:00'::timestamp, '2023-12-31 23:59:59'::timestamp, '1 hour') LOOP
			raise notice 'day_hour:%', TO_CHAR( d, 'DD_HH24' );
			EXECUTE 'CREATE TABLE '|| 'tp_ten' || t || '_' || TO_CHAR( d, 'DD_HH24' ) || ' PARTITION OF ' 
				|| 'tp_ten' || t || ' FOR VALUES FROM (''' || d || ''') TO ('''|| d + INTERVAL '1 hour' ||''') USING columnar;';
		END LOOP;
    END LOOP;
END $$;
		 
INSERT INTO tp 
SELECT rnd.id, ten.targetid,ten.tenant, rnd.datetime, rnd.speed from
(SELECT row_number() over () as id,
generate_series('2023-12-01 00:00:00', '2023-12-31 23:59:59', '1 sec'::interval)::timestamp as datetime,
floor(random() * 100 + 1)::int as speed,
floor(random() * 10 + 1)::int as random) rnd inner join ten on rnd.random=ten.id;
select count(*) from tp --2.678.400
CREATE TABLE ev (id int, targetid int, tenant text, datetime timestamp without time zone, code text,
				CONSTRAINT ev_pkey PRIMARY KEY (id, tenant, datetime) ) PARTITION BY LIST (tenant);
DO $$
DECLARE t text; d timestamp;
BEGIN 
FOR t IN SELECT distinct tenant FROM ten LOOP
	    raise notice 'ev_ten%', t;
        EXECUTE 'CREATE TABLE '|| 'ev_ten' || t || ' PARTITION OF ev FOR VALUES IN  ('''|| t ||''') PARTITION BY RANGE(datetime);';
		FOR d IN SELECT * FROM  generate_series('2023-12-01 00:00:00'::timestamp, '2023-12-31 23:59:59'::timestamp, '1 hour') LOOP
			raise notice 'day_hour:%', TO_CHAR( d, 'DD_HH24' );
			EXECUTE 'CREATE TABLE '|| 'ev_ten' || t || '_' || TO_CHAR( d, 'DD_HH24' ) || ' PARTITION OF ' 
				|| 'ev_ten' || t || ' FOR VALUES FROM (''' || d || ''') TO ('''|| d + INTERVAL '1 hour' ||''') USING columnar;';
		END LOOP;
    END LOOP;
END $$;				
	
INSERT INTO ev 
SELECT row_number() over () as id, t.targetid, t.tenant, t.datetime + floor(random() * 5)::int * interval '1 seconds' ,
(array['start','stop','break','engineon','engineoff'])[floor(random() * 5 + 1)] as code
FROM ( SELECT *, row_number() OVER(ORDER BY id ASC) AS row FROM tp) t
WHERE t.row % 5 = 0;
select count(*) from ev --535.680

--1.4sec:
SELECT tp.targetid, tp.datetime, tp.speed  FROM tp WHERE tp.tenant = 'a' AND tp.datetime between '2023-12-01 00:00:00' AND '2023-12-31 23:59:59' AND tp.targetid IN (102)
--1 sec:
SELECT ev.targetid, ev.datetime, ev.code FROM ev WHERE ev.tenant = 'a' AND ev.datetime between '2023-12-01 00:00:00' AND '2023-12-31 23:59:59' AND ev.targetid IN (102)

--16sec:
   SELECT ev.targetid, ev.datetime, ev.code, tp.speed 
   FROM ev  LEFT JOIN  tp on  ev.targetid = tp.targetid AND ev.datetime = tp.datetime  AND ev.tenant = tp.tenant    
	WHERE ev.tenant = 'a' AND ev.datetime between '2023-12-01 00:00:00' AND '2023-12-31 23:59:59' AND ev.targetid IN (102)
	AND tp.tenant = 'a' AND tp.datetime between '2023-12-01 00:00:00' AND '2023-12-31 23:59:59' AND tp.targetid IN (102)

Do you need any more information from me ?