snippetsqlMinor
How to optimize a query with a subquery based on a join
Viewed 0 times
withqueryjoinsubqueryoptimizebasedhow
Problem
I have this query, that finds all active contracts for candidate who have had less than 20 total completed contracts, or less than 5 for the contract site.
~45% of the query time is caused by a sequence scan to match the 2nd subquery. Is there any way to speed up that subquery?
The query plan is below;
```
Hash Semi Join (cost=1714368.60..1714534.45 rows=21 width=1458) (actual time=12317.173..13364.060 rows=103 loops=1)
Output: contracts.id, contracts.role_id, contracts.candidate_id, contracts.name, contracts.internal_name, contracts.description, contracts.created_at, contracts.updated_at, contracts.start_date, contracts.hourly_rate_pennies, contracts.hourly_rate_currency, contracts.state, contracts.first_seen, contracts.sent, contracts.waitlist, contracts.date_time_info, contracts.dress_code, contracts.why_work_here, contracts.reason_for_cancelling, contracts.reason_for_rejecting, contracts.d
EXPLAIN (ANALYZE,COSTS, VERBOSE, BUFFERS, FORMAT JSON)
SELECT "CONTRACTS".* FROM "CONTRACTS"
WHERE "CONTRACTS"."ID" IN (
SELECT "CONTRACTS"."ID" FROM "CONTRACTS"
INNER JOIN "ROLES" ON "ROLES"."ID" = "CONTRACTS"."ROLE_ID"
WHERE (
CANDIDATE_ID NOT IN (
SELECT DISTINCT "CONTRACTS"."CANDIDATE_ID" FROM "CONTRACTS"
WHERE "CONTRACTS"."STATE" IN ('ARCHIVED', 'PENDING_APPROVAL', 'APPROVED', 'AUTO_APPROVED')
GROUP BY "CONTRACTS"."CANDIDATE_ID"
HAVING (COUNT(CONTRACTS.ID) >= 20)
) AND (CANDIDATE_ID, ROLES.SITE_ID) NOT IN (
SELECT DISTINCT "CONTRACTS"."CANDIDATE_ID", "ROLES"."SITE_ID" FROM "CONTRACTS"
INNER JOIN "ROLES" ON "ROLES"."ID" = "CONTRACTS"."ROLE_ID"
WHERE "CONTRACTS"."STATE" IN ('ARCHIVED', 'PENDING_APPROVAL', 'APPROVED', 'AUTO_APPROVED')
GROUP BY contracts.CANDIDATE_ID, ROLES.SITE_ID
HAVING (COUNT(ROLES.SITE_ID) >= 5)
)
)
) AND "CONTRACTS"."STATE" = 'ACTIVE' AND (CONTRACTS.START_DATETIME >= '2018-06-16 13:16:09.013943' AND CONTRACTS.START_DATETIME <= '2018-06-16 17:16:09.015998')~45% of the query time is caused by a sequence scan to match the 2nd subquery. Is there any way to speed up that subquery?
The query plan is below;
```
Hash Semi Join (cost=1714368.60..1714534.45 rows=21 width=1458) (actual time=12317.173..13364.060 rows=103 loops=1)
Output: contracts.id, contracts.role_id, contracts.candidate_id, contracts.name, contracts.internal_name, contracts.description, contracts.created_at, contracts.updated_at, contracts.start_date, contracts.hourly_rate_pennies, contracts.hourly_rate_currency, contracts.state, contracts.first_seen, contracts.sent, contracts.waitlist, contracts.date_time_info, contracts.dress_code, contracts.why_work_here, contracts.reason_for_cancelling, contracts.reason_for_rejecting, contracts.d
Solution
The query below uses EXISTS and NOT EXISTS instead of IN and NOT IN.
I think it should give the results you're aiming for.
But I could only test it on a small set of sample data.
So you'd have to test yourself :
1) if it's correct
2) if it has better performance
Also note that the timestamp criteria aren't hardcoded in it.
But you can just replace those with hardcoded values.
(And remove the TST_ from the table names)
Sample data:
I think it should give the results you're aiming for.
But I could only test it on a small set of sample data.
So you'd have to test yourself :
1) if it's correct
2) if it has better performance
Also note that the timestamp criteria aren't hardcoded in it.
But you can just replace those with hardcoded values.
(And remove the TST_ from the table names)
SELECT c.*
FROM TST_CONTRACTS c
WHERE c.START_DATETIME BETWEEN (CURRENT_TIMESTAMP - INTERVAL '4 hour') AND (CURRENT_TIMESTAMP - INTERVAL '1 hour')
AND c.STATE = 'ACTIVE'
AND EXISTS (
SELECT r.SITE_ID
FROM TST_ROLES r
WHERE (r.ID = c.ROLE_ID)
AND NOT EXISTS (
SELECT c2.CANDIDATE_ID
FROM TST_CONTRACTS c2
JOIN TST_ROLES r2 ON (r2.ID = c2.ROLE_ID)
WHERE c2.STATE IN ('ARCHIVED', 'PENDING_APPROVAL', 'APPROVED', 'AUTO_APPROVED')
AND (c2.CANDIDATE_ID = c.CANDIDATE_ID)
GROUP BY c2.CANDIDATE_ID
HAVING
( COUNT(DISTINCT c2.ID) >= 20
OR COUNT(DISTINCT CASE WHEN r2.SITE_ID = r.SITE_ID THEN c2.ID END) >= 5
)
)
);
Sample data:
CREATE TABLE TST_CONTRACTS (ID INT, ROLE_ID INT, CANDIDATE_ID INT, STATE VARCHAR(30), START_DATETIME TIMESTAMP);
CREATE TABLE TST_ROLES (ID INT, SITE_ID INT);
INSERT INTO TST_CONTRACTS (ID, ROLE_ID, CANDIDATE_ID, STATE, START_DATETIME) VALUES
(51,11,101,'ACTIVE',(CURRENT_TIMESTAMP - INTERVAL '2 hour'))
,(52,12,101,'ACTIVE',(CURRENT_TIMESTAMP - INTERVAL '2 hour'))
,(11,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(12,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(13,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(14,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(21,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(22,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(23,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
-- ,(24,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(25,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(26,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(27,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(28,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(29,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(30,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(31,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(32,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(33,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(34,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(35,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
-- ,(36,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
;
INSERT INTO TST_ROLES (ID, SITE_ID) VALUES
(11,1001),(12,1002),(13,1002)
;Code Snippets
CREATE TABLE TST_CONTRACTS (ID INT, ROLE_ID INT, CANDIDATE_ID INT, STATE VARCHAR(30), START_DATETIME TIMESTAMP);
CREATE TABLE TST_ROLES (ID INT, SITE_ID INT);
INSERT INTO TST_CONTRACTS (ID, ROLE_ID, CANDIDATE_ID, STATE, START_DATETIME) VALUES
(51,11,101,'ACTIVE',(CURRENT_TIMESTAMP - INTERVAL '2 hour'))
,(52,12,101,'ACTIVE',(CURRENT_TIMESTAMP - INTERVAL '2 hour'))
,(11,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(12,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(13,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(14,11,101,'APPROVED',(CURRENT_TIMESTAMP - INTERVAL '1 day'))
,(21,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(22,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(23,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
-- ,(24,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(25,12,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(26,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(27,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(28,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(29,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(30,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(31,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(32,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(33,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(34,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
,(35,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
-- ,(36,13,101,'ARCHIVED',(CURRENT_TIMESTAMP - INTERVAL '10 day'))
;
INSERT INTO TST_ROLES (ID, SITE_ID) VALUES
(11,1001),(12,1002),(13,1002)
;Context
StackExchange Database Administrators Q#209854, answer score: 2
Revisions (0)
No revisions yet.