HiveBrain v1.2.0
Get Started
← Back to all entries
patternsqlMinor

Detecting inline, inline-compressed and TOAST storage

Submitted by: @import:stackexchange-dba··
0
Viewed 0 times
toastandstorageinlinecompresseddetecting

Problem

Imagine I have a table in Postgres 13 like this:
CREATE TABLE public.people (
id integer PRIMARY KEY,
full_name character varying(255),
bio text
);


I then insert a row with enough characters for the bio to be written to a TOAST table (4000 random bytes that should compress to > 2Kb):
# insert into people values (1, 'joe toast', (SELECT array_to_string(ARRAY(SELECT chr((65 + round(random() * 25)) :: integer) FROM generate_series(1,4000)), '')));
INSERT 0 1


Then insert a row with enough characters for the bio fit inline (3000 repeated bytes that should compress to # insert into people values (2, 'joe compressed', (SELECT array_to_string(ARRAY(SELECT chr(65) FROM generate_series(1,3000)), '')));
INSERT 0 1


Finally insert a row with only a few characters in the bio so it'll store inline (10 repeated bytes):
# insert into people values (3, 'joe inline', 'aaaaaaaaaa');
INSERT 0 1


Is there any way for me to detect the storage strategy for the bio in each tuple? Can I report on the percentage of rows that are inline or in TOAST ("22% of tuples store the bio inline, 78% in TOAST")?

A related question: is there anyway for me to know the number of bytes on disk for the tuple broken down by inline, inline compressed, and TOAST storage?

context: I'm working with a partitioned table that has more than a billion rows in aggregate, and I'm interested in knowing how often a particular column is stored inline vs in TOAST.
Research

I can get the size on disk for each bio, in one case it's clearly the inline-compressed size:
# select id, full_name, pg_column_size(bio) from people order by id;
id | full_name | pg_column_size
----+----------------+----------------
1 | joe toast | 4000
2 | joe compressed | 44
3 | joe inline | 11
(3 rows)


Comparing that size to the size of the uncompressed data tells us something about the compression, but can it tell us anything about the TOAST state?
`# select id,

Solution

About the approach

  • It works with Little Endian byte order. Will have to make it work for Big Endian at some point (tell me if your system is Big Endian)



  • out_of_line means data is stored in TOAST



  • bytes_on_disk and uncompressed_bytes may include some metadata length (either 1 or 4 bytes), need to polish it out some day.



  • It uses inner join people, if you want to see rows that aren't visible (e.g. deleted but not yet vacuumed), use left join people



+--+--------------+------------------+----------+-----------+-------------+
|id|full_name     |uncompressed_bytes|compressed|out_of_line|bytes_on_disk|
+--+--------------+------------------+----------+-----------+-------------+
|1 |joe toast     |4004              |false     |true       |4000         |
|2 |joe compressed|3000              |true      |false      |44           |
|3 |joe inline    |10                |false     |false      |11           |


Implementation

First turn on pageinspect inspection and create functions to get the information from column metadata:

create extension pageinspect;

create or replace function is_toasted(datum_header bytea) returns bool as $ begin
  return get_byte(datum_header, 0) = 1;
end; $ language plpgsql;

create or replace function is_1b_meta(datum_header bytea) returns bool as $ begin
  return not is_toasted(datum_header) and get_byte(datum_header, 0) & 1 > 0;
end; $ LANGUAGE plpgsql;

create or replace function is_compressed(datum_header bytea) returns bool as $ begin
  if(is_1b_meta(datum_header)) then
    return false;
  elsif(not is_toasted(datum_header)) then
    return get_byte(datum_header, 0) & 2 > 0;
  else
    return bytes_on_disk(datum_header)+4 != toasted_original_len(datum_header);
  end if;
end; $ LANGUAGE plpgsql;

create or replace function meta_len(datum_header bytea) returns int as $begin
  if is_1b_meta(datum_header) then return 1;
  else                        return 4;
  end if;
end;$ language plpgsql;

create or replace function bytes_on_disk(datum_header bytea) returns int language plpgsql as $begin
  if(is_1b_meta(datum_header)) then
    return get_byte(datum_header, 0) >> 1;
  elsif(not is_toasted(datum_header)) then
    return (get_byte(datum_header, 0) >> 2)
         | (get_byte(datum_header, 1) > 1;--not needed anymore
  else
    return get_byte(datum_header, 2)
         | (get_byte(datum_header, 3) << 8)
         | (get_byte(datum_header, 4) << 16)
         | (get_byte(datum_header, 5) << 24);
  end if;
end;$;

create or replace function meta_bits(datum_header bytea) returns bit as $
declare
  len int;
  i int;
  res bit varying(32);
begin
  i = 0;
  res = '';
  len = meta_len(datum_header);
  while i < len loop
    res = res || get_byte(datum_header, i)::bit(8);
    i = i+1;
  end loop;
  return res;
end; $ language plpgsql;


Now you can select some column ([3] means column #3), take binary data and parse the header:

with bits as(
  select t_ctid as ctid,
         (tuple_data_split('people'::regclass, t_data, t_infomask, t_infomask2, t_bits))[3] as bits
  from generate_series(0, (select max((ctid::text::point)[0]::int) from people)) as page,
  lateral heap_page_items(get_raw_page('people', page))
)
select p.id, p.full_name,
       case when is_toasted(bits) then toasted_original_len(bits)
            else                       length(p.bio)
       end as uncompressed_bytes,
       --meta_bits(bits),
       is_compressed(bits) compressed, is_toasted(bits) out_of_line,
       bytes_on_disk(bits)
from bits
inner join people p on p.ctid=bits.ctid;


Inside Postgres

This information is determined from what Postgres stores internally. There are 3 options for varlena (fields of variable length) metadata (code, docs, presentation):

  • 1 byte. The data itself is inlined, 126 bytes max. Never compressed.



  • 4 bytes. The data is inlined, could be either compressed or not.



  • 18 bytes (1st byte has only 1 bit set) with the data stored in TOAST. Could be either compressed or not.

Code Snippets

+--+--------------+------------------+----------+-----------+-------------+
|id|full_name     |uncompressed_bytes|compressed|out_of_line|bytes_on_disk|
+--+--------------+------------------+----------+-----------+-------------+
|1 |joe toast     |4004              |false     |true       |4000         |
|2 |joe compressed|3000              |true      |false      |44           |
|3 |joe inline    |10                |false     |false      |11           |
create extension pageinspect;

create or replace function is_toasted(datum_header bytea) returns bool as $$ begin
  return get_byte(datum_header, 0) = 1;
end; $$ language plpgsql;

create or replace function is_1b_meta(datum_header bytea) returns bool as $$ begin
  return not is_toasted(datum_header) and get_byte(datum_header, 0) & 1 > 0;
end; $$ LANGUAGE plpgsql;

create or replace function is_compressed(datum_header bytea) returns bool as $$ begin
  if(is_1b_meta(datum_header)) then
    return false;
  elsif(not is_toasted(datum_header)) then
    return get_byte(datum_header, 0) & 2 > 0;
  else
    return bytes_on_disk(datum_header)+4 != toasted_original_len(datum_header);
  end if;
end; $$ LANGUAGE plpgsql;

create or replace function meta_len(datum_header bytea) returns int as $$begin
  if is_1b_meta(datum_header) then return 1;
  else                        return 4;
  end if;
end;$$ language plpgsql;

create or replace function bytes_on_disk(datum_header bytea) returns int language plpgsql as $$begin
  if(is_1b_meta(datum_header)) then
    return get_byte(datum_header, 0) >> 1;
  elsif(not is_toasted(datum_header)) then
    return (get_byte(datum_header, 0) >> 2)
         | (get_byte(datum_header, 1) << 6)
         | (get_byte(datum_header, 2) << 14)
         | (get_byte(datum_header, 3) << 22);
  else
    return get_byte(datum_header, 6)
         | (get_byte(datum_header, 7) << 8)
         | (get_byte(datum_header, 8) << 16)
         | (get_byte(datum_header, 9) << 24);
  end if;
end;$$;

create or replace function toasted_original_len(datum_header bytea) returns integer language plpgsql as $$ begin
  if(not is_toasted(datum_header)) then
    return get_byte(datum_header, 0) >> 1;--not needed anymore
  else
    return get_byte(datum_header, 2)
         | (get_byte(datum_header, 3) << 8)
         | (get_byte(datum_header, 4) << 16)
         | (get_byte(datum_header, 5) << 24);
  end if;
end;$$;

create or replace function meta_bits(datum_header bytea) returns bit as $$
declare
  len int;
  i int;
  res bit varying(32);
begin
  i = 0;
  res = '';
  len = meta_len(datum_header);
  while i < len loop
    res = res || get_byte(datum_header, i)::bit(8);
    i = i+1;
  end loop;
  return res;
end; $$ language plpgsql;
with bits as(
  select t_ctid as ctid,
         (tuple_data_split('people'::regclass, t_data, t_infomask, t_infomask2, t_bits))[3] as bits
  from generate_series(0, (select max((ctid::text::point)[0]::int) from people)) as page,
  lateral heap_page_items(get_raw_page('people', page))
)
select p.id, p.full_name,
       case when is_toasted(bits) then toasted_original_len(bits)
            else                       length(p.bio)
       end as uncompressed_bytes,
       --meta_bits(bits),
       is_compressed(bits) compressed, is_toasted(bits) out_of_line,
       bytes_on_disk(bits)
from bits
inner join people p on p.ctid=bits.ctid;

Context

StackExchange Database Administrators Q#301669, answer score: 6

Revisions (0)

No revisions yet.