patternsqlMinor
Parsing JSON data from a text column in Postgres
Viewed 0 times
postgrescolumntextparsingjsonfromdata
Problem
I have
and I have loaded this to one temp table:
Now I want to extract this data into separate columns as:
JSON in a file as follows:[xyz@innolx20122 ~]$ cat cgs_test.json
{"technology":"AAA","vendor":"XXX","name":"RBNI","temporal_unit":"hour","regional_unit":"cell","dataset_metadata":"{\"name\": \"RBNI\", \"temporal_unit\": \"hour\", \"technology\": \"LTE\", \"is_dimension\": false, \"timestamp_column_pattern\": \"yyyyMMddHHmmss\", \"data_type\": \"PM\", \"source_name\": \"RBNI\", \"intervals_epoch_seconds\": [[1609941600, 1609945200]], \"identifier_column_names\": [\"CELLID\", \"CELLNAME\", \"radio_frequency_band\", \"ENODEBID\", \"ENODEBNAME\", \"SBNID\", \"SITEID\", \"SITENAME\", \"CLUSTER_PRIORITY\", \"CP_SITE\", \"IBC\", \"NETWORK_TIER\", \"SITE_TYPE\", \"T3_FLAG\", \"CLUSTERID\", \"CLUSTERNAME\", \"REGION\", \"NETWORK\"], \"vendor\": \"ZTE\", \"timestamp_column_name\": \"COLLECTTIME\", \"regional_unit\": \"cell\"}","rk":1}
{"technology":"AAA","vendor":"XXX","name":"RRCADD","temporal_unit":"hour","regional_unit":"cell","dataset_metadata":"{\"name\": \"RRCADD\", \"temporal_unit\": \"hour\", \"technology\": \"AAA\", \"is_dimension\": false, \"timestamp_column_pattern\": \"yyyyMMddHHmmss\", \"data_type\": \"PM\", \"source_name\": \"RRCADD\", \"intervals_epoch_seconds\": [[1609941600, 1609945200]], \"identifier_column_names\": [\"CELLID\", \"CELLNAME\", \"radio_frequency_band\", \"ENODEBID\", \"ENODEBNAME\", \"SBNID\", \"SITEID\", \"SITENAME\", \"CLUSTER_PRIORITY\", \"CP_SITE\", \"IBC\", \"NETWORK_TIER\", \"SITE_TYPE\", \"T3_FLAG\", \"CLUSTERID\", \"CLUSTERNAME\", \"REGION\", \"NETWORK\"], \"vendor\": \"XXX\", \"timestamp_column_name\": \"COLLECTTIME\", \"regional_unit\": \"cell\"}","rk":1}and I have loaded this to one temp table:
create table temp_json (values text);
\copy temp_json from '/home/xyz/cgs_test.json';Now I want to extract this data into separate columns as:
technology
vendor
name
temporal_unit
regional_unit
dataset_metadata
rkdataset_metadata is a JSON column and other columns are strings.Solution
To answer this question what I did was the following (see fiddle here):
This was quite a learning experience for me (+1 for that BTW) so I'll go through the steps in my logic and hopefully this will help you - but I'm doing it for myself really :-)
Step 1:
Step 2:
Populate it with your two records. However, your two records aren't proper JSON - they need a
There may be a way of not having to do this with a different
Then, turn the strings into JSON as follows - this can be done in bulk:
Step 3:
Create another table:
Step 4:
Populate this by:
The
Then, clear down
technology vendor name temporal_unit regional_unit dataset_metadata rk
AAA XXX RBNI hour cell "{\"name\": \"RBNI\", \"temporal_unit\": \"hour\", \"technology\": \"LTE\", \"is_dimension\": false, \"timestamp_column_pattern\": \"yyyyMMddHHmmss\", \"data_type\": \"PM\", \"source_name\": \"RBNI\", \"intervals_epoch_seconds\": [[1609941600, 1609945200]], \"identifier_column_names\": [\"CELLID\", \"CELLNAME\", \"radio_frequency_band\", \"ENODEBID\", \"ENODEBNAME\", \"SBNID\", \"SITEID\", \"SITENAME\", \"CLUSTER_PRIORITY\", \"CP_SITE\", \"IBC\", \"NETWORK_TIER\", \"SITE_TYPE\", \"T3_FLAG\", \"CLUSTERID\", \"CLUSTERNAME\", \"REGION\", \"NETWORK\"], \"vendor\": \"ZTE\", \"timestamp_column_name\": \"COLLECTTIME\", \"regional_unit\": \"cell\"}" 1
AAA XXX RRCADD hour cell "{\"name\": \"RRCADD\", \"temporal_unit\": \"hour\", \"technology\": \"AAA\", \"is_dimension\": false, \"timestamp_column_pattern\": \"yyyyMMddHHmmss\", \"data_type\": \"PM\", \"source_name\": \"RRCADD\", \"intervals_epoch_seconds\": [[1609941600, 1609945200]], \"identifier_column_names\": [\"CELLID\", \"CELLNAME\", \
This was quite a learning experience for me (+1 for that BTW) so I'll go through the steps in my logic and hopefully this will help you - but I'm doing it for myself really :-)
Step 1:
CREATE TABLE temp_1
(
json_input TEXT NOT NULL
);Step 2:
Populate it with your two records. However, your two records aren't proper JSON - they need a
[ at the beginning and ] at the end in order for them to be proper JSON - so, I've put them into the INSERT at the very beginning of the process - I leave it up to the reader to experiment with what happens if you leave them out - you can put them in later in the process - see below.There may be a way of not having to do this with a different
JSON function - beyond my pay grade I'm afraid - but I'd appreciate any suggestions for improvements.INSERT INTO temp_1 VALUES -- strings copied from the question
('{"technology":"AAA","vendor":"XXX","name":"RBNI","temporal_unit":"hour","regional_unit":"cell","dataset_metadata":"{\"name\": \"RBNI\", \"temporal_unit\": \"hour\", \"technology\": \"LTE\", \"is_dimension\": false, \"timestamp_column_pattern\": \"yyyyMMddHHmmss\", \"data_type\": \"PM\", \"source_name\": \"RBNI\", \"intervals_epoch_seconds\": [[1609941600, 1609945200]], \"identifier_column_names\": [\"CELLID\", \"CELLNAME\", \"radio_frequency_band\", \"ENODEBID\", \"ENODEBNAME\", \"SBNID\", \"SITEID\", \"SITENAME\", \"CLUSTER_PRIORITY\", \"CP_SITE\", \"IBC\", \"NETWORK_TIER\", \"SITE_TYPE\", \"T3_FLAG\", \"CLUSTERID\", \"CLUSTERNAME\", \"REGION\", \"NETWORK\"], \"vendor\": \"ZTE\", \"timestamp_column_name\": \"COLLECTTIME\", \"regional_unit\": \"cell\"}","rk":1}'),
('{"technology":"AAA","vendor":"XXX","name":"RRCADD","temporal_unit":"hour","regional_unit":"cell","dataset_metadata":"{\"name\": \"RRCADD\", \"temporal_unit\": \"hour\", \"technology\": \"AAA\", \"is_dimension\": false, \"timestamp_column_pattern\": \"yyyyMMddHHmmss\", \"data_type\": \"PM\", \"source_name\": \"RRCADD\", \"intervals_epoch_seconds\": [[1609941600, 1609945200]], \"identifier_column_names\": [\"CELLID\", \"CELLNAME\", \"radio_frequency_band\", \"ENODEBID\", \"ENODEBNAME\", \"SBNID\", \"SITEID\", \"SITENAME\", \"CLUSTER_PRIORITY\", \"CP_SITE\", \"IBC\", \"NETWORK_TIER\", \"SITE_TYPE\", \"T3_FLAG\", \"CLUSTERID\", \"CLUSTERNAME\", \"REGION\", \"NETWORK\"], \"vendor\": \"XXX\", \"timestamp_column_name\": \"COLLECTTIME\", \"regional_unit\": \"cell\"}","rk":1}');Then, turn the strings into JSON as follows - this can be done in bulk:
--
-- This is to turn the "ordinary" strings into JSON per record. If this isn't
-- done, the process will fail further down. Will work for an arbitrary number
-- of records
--
UPDATE temp_1
SET json_input = '[' || json_input || ']';Step 3:
Create another table:
CREATE TABLE temp_2
(
final_json JSON NOT NULL
);Step 4:
Populate this by:
INSERT INTO temp_2 SELECT json_input::JSON FROM temp_1;The
::JSON cast is necessary, otherwise the INSERT will fail as json_input is of type TEXT.Then, clear down
temp_1':
--
-- Clear down temp_1.
-- This is needed for next batch, otherwise you'll have strings beginning
-- with '[[' and then '[[[' (endings same with multiple ']'s) and so on!
--
DELETE FROM temp_1;
Step 5:
Create a table to hold the data:
CREATE TABLE json_table
(
technology TEXT,
vendor TEXT,
name TEXT,
temporal_unit TEXT,
regional_unit TEXT,
dataset_metadata JSON,
rk SMALLINT
);
Step 6:
From here, I obtained this method to populate the table:
INSERT INTO json_table
SELECT (json_populate_recordset(null::json_table, final_json)).* FROM temp_2;
There's another way shown in the fiddle - from here - YMMV?
Step 7 - final check:
SELECT * FROM json_table;
Result:
``technology vendor name temporal_unit regional_unit dataset_metadata rk
AAA XXX RBNI hour cell "{\"name\": \"RBNI\", \"temporal_unit\": \"hour\", \"technology\": \"LTE\", \"is_dimension\": false, \"timestamp_column_pattern\": \"yyyyMMddHHmmss\", \"data_type\": \"PM\", \"source_name\": \"RBNI\", \"intervals_epoch_seconds\": [[1609941600, 1609945200]], \"identifier_column_names\": [\"CELLID\", \"CELLNAME\", \"radio_frequency_band\", \"ENODEBID\", \"ENODEBNAME\", \"SBNID\", \"SITEID\", \"SITENAME\", \"CLUSTER_PRIORITY\", \"CP_SITE\", \"IBC\", \"NETWORK_TIER\", \"SITE_TYPE\", \"T3_FLAG\", \"CLUSTERID\", \"CLUSTERNAME\", \"REGION\", \"NETWORK\"], \"vendor\": \"ZTE\", \"timestamp_column_name\": \"COLLECTTIME\", \"regional_unit\": \"cell\"}" 1
AAA XXX RRCADD hour cell "{\"name\": \"RRCADD\", \"temporal_unit\": \"hour\", \"technology\": \"AAA\", \"is_dimension\": false, \"timestamp_column_pattern\": \"yyyyMMddHHmmss\", \"data_type\": \"PM\", \"source_name\": \"RRCADD\", \"intervals_epoch_seconds\": [[1609941600, 1609945200]], \"identifier_column_names\": [\"CELLID\", \"CELLNAME\", \
Code Snippets
CREATE TABLE temp_1
(
json_input TEXT NOT NULL
);INSERT INTO temp_1 VALUES -- strings copied from the question
('{"technology":"AAA","vendor":"XXX","name":"RBNI","temporal_unit":"hour","regional_unit":"cell","dataset_metadata":"{\"name\": \"RBNI\", \"temporal_unit\": \"hour\", \"technology\": \"LTE\", \"is_dimension\": false, \"timestamp_column_pattern\": \"yyyyMMddHHmmss\", \"data_type\": \"PM\", \"source_name\": \"RBNI\", \"intervals_epoch_seconds\": [[1609941600, 1609945200]], \"identifier_column_names\": [\"CELLID\", \"CELLNAME\", \"radio_frequency_band\", \"ENODEBID\", \"ENODEBNAME\", \"SBNID\", \"SITEID\", \"SITENAME\", \"CLUSTER_PRIORITY\", \"CP_SITE\", \"IBC\", \"NETWORK_TIER\", \"SITE_TYPE\", \"T3_FLAG\", \"CLUSTERID\", \"CLUSTERNAME\", \"REGION\", \"NETWORK\"], \"vendor\": \"ZTE\", \"timestamp_column_name\": \"COLLECTTIME\", \"regional_unit\": \"cell\"}","rk":1}'),
('{"technology":"AAA","vendor":"XXX","name":"RRCADD","temporal_unit":"hour","regional_unit":"cell","dataset_metadata":"{\"name\": \"RRCADD\", \"temporal_unit\": \"hour\", \"technology\": \"AAA\", \"is_dimension\": false, \"timestamp_column_pattern\": \"yyyyMMddHHmmss\", \"data_type\": \"PM\", \"source_name\": \"RRCADD\", \"intervals_epoch_seconds\": [[1609941600, 1609945200]], \"identifier_column_names\": [\"CELLID\", \"CELLNAME\", \"radio_frequency_band\", \"ENODEBID\", \"ENODEBNAME\", \"SBNID\", \"SITEID\", \"SITENAME\", \"CLUSTER_PRIORITY\", \"CP_SITE\", \"IBC\", \"NETWORK_TIER\", \"SITE_TYPE\", \"T3_FLAG\", \"CLUSTERID\", \"CLUSTERNAME\", \"REGION\", \"NETWORK\"], \"vendor\": \"XXX\", \"timestamp_column_name\": \"COLLECTTIME\", \"regional_unit\": \"cell\"}","rk":1}');--
-- This is to turn the "ordinary" strings into JSON per record. If this isn't
-- done, the process will fail further down. Will work for an arbitrary number
-- of records
--
UPDATE temp_1
SET json_input = '[' || json_input || ']';CREATE TABLE temp_2
(
final_json JSON NOT NULL
);INSERT INTO temp_2 SELECT json_input::JSON FROM temp_1;Context
StackExchange Database Administrators Q#282727, answer score: 5
Revisions (0)
No revisions yet.