HiveBrain v1.2.0
Get Started
← Back to all entries
snippetMinor

How can I optimize a query with multiple derived tables containing `IN` and `GROUP BY`?

Submitted by: @import:stackexchange-dba··
0
Viewed 0 times
containingcantablesgroupwithqueryoptimizemultiplehowand

Problem

I'm collecting nmap data every five minutes and storing it in a database. Information about each scan (e.g. start and end time) is stored in the scans table:

CREATE TABLE `scans` (
  `scan_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `start_time` datetime NOT NULL,
  `end_time` datetime NOT NULL,
  `nmap_version` varchar(20) DEFAULT NULL,
  `nmap_args` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`scan_id`)
) ENGINE=InnoDB AUTO_INCREMENT=34901 DEFAULT CHARSET=utf8


Information about the hosts scanned (e.g. hostname, MAC address) is stored in the hosts table:

CREATE TABLE `hosts` (
  `scan_id` int(10) unsigned NOT NULL,
  `host_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `hostname` varchar(255) NOT NULL,
  `ip_address` int(10) unsigned NOT NULL,
  `mac_address` bigint(20) unsigned DEFAULT NULL,
  `mac_vendor` varchar(255) DEFAULT NULL,
  `status` varchar(20) NOT NULL,
  `hops` int(10) unsigned DEFAULT NULL,
  `last_boot` datetime DEFAULT NULL,
  PRIMARY KEY (`host_id`),
  KEY `scan_id` (`scan_id`),
  KEY `idx_status` (`status`),
  KEY `idx_hostname` (`hostname`),
  CONSTRAINT `hosts_ibfk_1` FOREIGN KEY (`scan_id`) REFERENCES `scans` (`scan_id`) ON DELETE CASCADE
) ENGINE=InnoDB AUTO_INCREMENT=2262995 DEFAULT CHARSET=utf8


Now I want to fetch the information from the most recent scan of one or more (up to about a thousand) hosts. The most recent scan will not necessarily be the same for all hosts. I also want to fetch the last time each host was up.

I've been using the following query, but it's slow (it takes about six seconds to fetch the data for three hosts):

```
SELECT hosts.hostname,
INET_NTOA(hosts.ip_address) AS ip,
CONV(hosts.mac_address, 10, 16) AS mac,
hosts.mac_vendor AS mac_vendor,
hosts.status AS status,
scans.start_time AS last_scan,
u.last_seen AS last_seen
FROM hosts
JOIN (
-- ID of most recent scan for each host
SELECT MAX(hosts.scan_id) AS max_scan_id,
h

Solution

You can really benefit from an index on hosts (hostname, scan_id) for this query, and possibly another one including status (especially for the second query below). Your query may also benefit from transferring some joins to per-row totals:

CREATE INDEX idx_hostname_scanid ON hosts (hostname, scan_id);
CREATE INDEX idx_hostname_status_scanid ON hosts (hostname, status, scan_id);

SELECT hosts.hostname,
       INET_NTOA(hosts.ip_address) AS ip,
       CONV(hosts.mac_address, 10, 16) AS mac,
       hosts.mac_vendor AS mac_vendor,
       hosts.status AS status,
       scans.start_time AS last_scan,
       (SELECT MAX(scans.start_time)
        FROM hosts
        JOIN scans ON (scans.scan_id = hosts.scan_id)
        WHERE hosts.hostname = t.hostname AND hosts.status = 'up') AS last_seen
FROM (
       -- ID of most recent scan for each host
       SELECT MAX(hosts.scan_id) AS max_scan_id, hosts.hostname
       FROM hosts
       WHERE hosts.hostname IN ('foo', 'bar', 'baz')
       GROUP BY hosts.hostname
     ) t
JOIN hosts ON (hosts.hostname = t.hostname AND hosts.scan_id = t.max_scan_id)
JOIN scans ON (scans.scan_id = t.max_scan_id);


Also, considering that you already trust the last scan to be the one with the highest id, you may speed-up your query by trusting the last_seen time to be the one with the highest id:

CREATE INDEX idx_hostname_scanid ON hosts (hostname, scan_id);
CREATE INDEX idx_hostname_status_scanid ON hosts (hostname, status, scan_id);

SELECT hosts.hostname,
       INET_NTOA(hosts.ip_address) AS ip,
       CONV(hosts.mac_address, 10, 16) AS mac,
       hosts.mac_vendor AS mac_vendor,
       hosts.status AS status,
       scans.start_time AS last_scan,
       lss.start_time AS last_seen
FROM (
       -- ID of most recent scan for each host
       SELECT MAX(hosts.scan_id) AS max_scan_id, hosts.hostname
       FROM hosts
       WHERE hosts.hostname IN ('foo', 'bar', 'baz')
       GROUP BY hosts.hostname
     ) t
JOIN hosts ON (hosts.hostname = t.hostname AND hosts.scan_id = t.max_scan_id)
JOIN scans ON (scans.scan_id = t.max_scan_id)
LEFT JOIN (
       SELECT MAX(hosts.scan_id) AS max_scan_id, hosts.hostname
       FROM hosts
       WHERE hosts.hostname IN ('foo', 'bar', 'baz') AND hosts.status = 'up'
       GROUP BY hosts.hostname
     ) ls ON (ls.hostname = t.hostname)
LEFT JOIN scans lss ON (lss.scan_id = ls.max_scan_id);

Code Snippets

CREATE INDEX idx_hostname_scanid ON hosts (hostname, scan_id);
CREATE INDEX idx_hostname_status_scanid ON hosts (hostname, status, scan_id);

SELECT hosts.hostname,
       INET_NTOA(hosts.ip_address) AS ip,
       CONV(hosts.mac_address, 10, 16) AS mac,
       hosts.mac_vendor AS mac_vendor,
       hosts.status AS status,
       scans.start_time AS last_scan,
       (SELECT MAX(scans.start_time)
        FROM hosts
        JOIN scans ON (scans.scan_id = hosts.scan_id)
        WHERE hosts.hostname = t.hostname AND hosts.status = 'up') AS last_seen
FROM (
       -- ID of most recent scan for each host
       SELECT MAX(hosts.scan_id) AS max_scan_id, hosts.hostname
       FROM hosts
       WHERE hosts.hostname IN ('foo', 'bar', 'baz')
       GROUP BY hosts.hostname
     ) t
JOIN hosts ON (hosts.hostname = t.hostname AND hosts.scan_id = t.max_scan_id)
JOIN scans ON (scans.scan_id = t.max_scan_id);
CREATE INDEX idx_hostname_scanid ON hosts (hostname, scan_id);
CREATE INDEX idx_hostname_status_scanid ON hosts (hostname, status, scan_id);

SELECT hosts.hostname,
       INET_NTOA(hosts.ip_address) AS ip,
       CONV(hosts.mac_address, 10, 16) AS mac,
       hosts.mac_vendor AS mac_vendor,
       hosts.status AS status,
       scans.start_time AS last_scan,
       lss.start_time AS last_seen
FROM (
       -- ID of most recent scan for each host
       SELECT MAX(hosts.scan_id) AS max_scan_id, hosts.hostname
       FROM hosts
       WHERE hosts.hostname IN ('foo', 'bar', 'baz')
       GROUP BY hosts.hostname
     ) t
JOIN hosts ON (hosts.hostname = t.hostname AND hosts.scan_id = t.max_scan_id)
JOIN scans ON (scans.scan_id = t.max_scan_id)
LEFT JOIN (
       SELECT MAX(hosts.scan_id) AS max_scan_id, hosts.hostname
       FROM hosts
       WHERE hosts.hostname IN ('foo', 'bar', 'baz') AND hosts.status = 'up'
       GROUP BY hosts.hostname
     ) ls ON (ls.hostname = t.hostname)
LEFT JOIN scans lss ON (lss.scan_id = ls.max_scan_id);

Context

StackExchange Database Administrators Q#136999, answer score: 3

Revisions (0)

No revisions yet.