Often you want to limit the number of group-by values in a table, and bucket everything else into an 'other' category.
In the top_states_by_eleveation
query below, we have a query with two stages. The first stage calculates the top states and nests the data to be aggregated. The second pipeline stage produces the actual aggregation.
source: airports is duckdb.table('../data/airports.parquet') extend { measure: airport_count is count() avg_elevation is elevation.avg() view: top_states_by_elevation is { group_by: state aggregate: avg_elevation calculate: is_other is row_number() > 5 nest: data is { group_by: code, elevation } } -> { group_by: state is pick state when not is_other else 'OTHER' aggregate: avg_elevation is data.elevation.avg() airport_count is data.count() # hidden sort is data.count() {where: not is_other } order_by: sort desc } }
Basic Query
run: airports -> top_states_by_elevation
[ { "state": "CO", "avg_elevation": 6255.863529411765, "airport_count": 425, "sort": 425 }, { "state": "NM", "avg_elevation": 5419.635359116022, "airport_count": 181, "sort": 181 }, { "state": "UT", "avg_elevation": 5066, "airport_count": 140, "sort": 140 }, { "state": "NV", "avg_elevation": 4029.9765625, "airport_count": 128, "sort": 128 }, { "state": "WY", "avg_elevation": 5619.365217391304, "airport_count": 115, "sort": 115 }, { "state": "OTHER", "avg_elevation": 910.3459902148479, "airport_count": 18781, "sort": 0 } ]
WITH __stage0 AS ( SELECT group_set, base."state" as "state__0", CASE WHEN group_set=0 THEN AVG(base."elevation") END as "avg_elevation__0", (CASE WHEN group_set=0 THEN ROW_NUMBER() OVER(PARTITION BY group_set ORDER BY CASE WHEN group_set=0 THEN AVG(base."elevation") END desc NULLS LAST ) END)>5 as "is_other__0", CASE WHEN group_set=1 THEN base."code" END as "code__1", CASE WHEN group_set=1 THEN base."elevation" END as "elevation__1" FROM '../data/airports.parquet' as base CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,1,1)) as group_set ) as group_set GROUP BY 1,2,5,6 ) , __stage1 AS ( SELECT "state__0" as "state", MAX(CASE WHEN group_set=0 THEN "avg_elevation__0" END) as "avg_elevation", MAX(CASE WHEN group_set=0 THEN "is_other__0" END) as "is_other", COALESCE(LIST({ "code": "code__1", "elevation": "elevation__1"} ORDER BY "code__1" asc NULLS LAST) FILTER (WHERE group_set=1),[]) as "data" FROM __stage0 GROUP BY 1 ) SELECT CASE WHEN COALESCE(NOT base."is_other",TRUE) THEN base."state" ELSE 'OTHER' END as "state", AVG(data_0."elevation") as "avg_elevation", COUNT(DISTINCT base."state" || 'x' || data_0_outer.__row_id) as "airport_count", COUNT(DISTINCT CASE WHEN COALESCE(NOT base."is_other",TRUE) THEN base."state" || 'x' || data_0_outer.__row_id END) as "sort" FROM __stage1 as base LEFT JOIN LATERAL (SELECT UNNEST(GENERATE_SERIES(1, length(base."data"),1)) as __row_id, UNNEST(base."data"), 1 as ignoreme) as data_0_outer(__row_id, data_0,ignoreme) ON data_0_outer.ignoreme=1 GROUP BY 1 ORDER BY 4 desc NULLS LAST
Nested Query
run: airports -> { group_by: `Facility Type` is fac_type aggregate: airport_count avg_elevation nest: top_states_by_elevation }
[ { "Facility Type": "AIRPORT", "airport_count": 13925, "avg_elevation": 1237.0441651705567, "top_states_by_elevation": [ { "state": "OTHER", "avg_elevation": 1017.9730281051677, "airport_count": 13236, "sort": 0 }, { "state": "CO", "avg_elevation": 5873.0642570281125, "airport_count": 249, "sort": 249 }, { "state": "NM", "avg_elevation": 5467.63870967742, "airport_count": 155, "sort": 155 }, { "state": "UT", "avg_elevation": 5257.18556701031, "airport_count": 97, "sort": 97 }, { "state": "NV", "avg_elevation": 4384.041237113402, "airport_count": 97, "sort": 97 }, { "state": "WY", "avg_elevation": 5570.0329670329675, "airport_count": 91, "sort": 91 } ] }, { "Facility Type": "HELIPORT", "airport_count": 5135, "avg_elevation": 950.5125608568646, "top_states_by_elevation": [ { "state": "OTHER", "avg_elevation": 655.6607547948031, "airport_count": 4849, "sort": 0 }, { "state": "CO", "avg_elevation": 6816.569696969697, "airport_count": 165, "sort": 165 }, { "state": "UT", "avg_elevation": 4634.720930232558, "airport_count": 43, "sort": 43 }, { "state": "MT", "avg_elevation": 3756.310344827586, "airport_count": 29, "sort": 29 }, { "state": "NM", "avg_elevation": 5170.76, "airport_count": 25, "sort": 25 }, { "state": "WY", "avg_elevation": 5806.416666666667, "airport_count": 24, "sort": 24 } ] }, { "Facility Type": "SEAPLANE BASE", "airport_count": 473, "avg_elevation": 488.82241014799155, "top_states_by_elevation": [ { "state": "NE", "avg_elevation": 1946, "airport_count": 1, "sort": 1 }, { "state": "NM", "avg_elevation": 4201, "airport_count": 1, "sort": 1 }, { "state": "SD", "avg_elevation": 1600, "airport_count": 1, "sort": 1 }, { "state": "OTHER", "avg_elevation": 448.49892008639307, "airport_count": 463, "sort": 0 }, { "state": "ID", "avg_elevation": 1884.6, "airport_count": 5, "sort": 5 }, { "state": "MT", "avg_elevation": 3194, "airport_count": 2, "sort": 2 } ] }, { "Facility Type": "ULTRALIGHT", "airport_count": 125, "avg_elevation": 806.144, "top_states_by_elevation": [ { "state": "CO", "avg_elevation": 8110, "airport_count": 1, "sort": 1 }, { "state": "MT", "avg_elevation": 3540, "airport_count": 1, "sort": 1 }, { "state": "OTHER", "avg_elevation": 617.9285714285714, "airport_count": 112, "sort": 0 }, { "state": "AZ", "avg_elevation": 2025.7142857142858, "airport_count": 7, "sort": 7 }, { "state": "CA", "avg_elevation": 1335, "airport_count": 2, "sort": 2 }, { "state": "NY", "avg_elevation": 1530, "airport_count": 2, "sort": 2 } ] }, { "Facility Type": "STOLPORT", "airport_count": 86, "avg_elevation": 1375.046511627907, "top_states_by_elevation": [ { "state": "NV", "avg_elevation": 4900, "airport_count": 1, "sort": 1 }, { "state": "AZ", "avg_elevation": 4772, "airport_count": 1, "sort": 1 }, { "state": "OTHER", "avg_elevation": 752.418918918919, "airport_count": 74, "sort": 0 }, { "state": "CO", "avg_elevation": 6211.666666666667, "airport_count": 6, "sort": 6 }, { "state": "CA", "avg_elevation": 4040.5, "airport_count": 2, "sort": 2 }, { "state": "MT", "avg_elevation": 3776, "airport_count": 2, "sort": 2 } ] }, { "Facility Type": "GLIDERPORT", "airport_count": 37, "avg_elevation": 1611.4054054054054, "top_states_by_elevation": [ { "state": "AZ", "avg_elevation": 3539, "airport_count": 2, "sort": 2 }, { "state": "NV", "avg_elevation": 4300, "airport_count": 1, "sort": 1 }, { "state": "OTHER", "avg_elevation": 737.5185185185185, "airport_count": 27, "sort": 0 }, { "state": "CO", "avg_elevation": 7061.666666666667, "airport_count": 3, "sort": 3 }, { "state": "KS", "avg_elevation": 2088.5, "airport_count": 2, "sort": 2 }, { "state": "NY", "avg_elevation": 1484.5, "airport_count": 2, "sort": 2 } ] }, { "Facility Type": "BALLOONPORT", "airport_count": 12, "avg_elevation": 1047.25, "top_states_by_elevation": [ { "state": "KS", "avg_elevation": 1250, "airport_count": 1, "sort": 1 }, { "state": "OH", "avg_elevation": 1164, "airport_count": 1, "sort": 1 }, { "state": "CO", "avg_elevation": 5050, "airport_count": 1, "sort": 1 }, { "state": "MI", "avg_elevation": 980, "airport_count": 1, "sort": 1 }, { "state": "OTHER", "avg_elevation": 416.6666666666667, "airport_count": 6, "sort": 0 }, { "state": "IL", "avg_elevation": 811.5, "airport_count": 2, "sort": 2 } ] } ]
WITH __stage0 AS ( SELECT group_set, base."fac_type" as "Facility Type__0", CASE WHEN group_set=0 THEN COUNT(1) END as "airport_count__0", CASE WHEN group_set=0 THEN AVG(base."elevation") END as "avg_elevation__0", CASE WHEN group_set IN (1,2) THEN base."state" END as "state__1", CASE WHEN group_set=1 THEN AVG(base."elevation") END as "avg_elevation__1", (CASE WHEN group_set=1 THEN ROW_NUMBER() OVER(PARTITION BY group_set, base."fac_type" ORDER BY CASE WHEN group_set=1 THEN AVG(base."elevation") END desc NULLS LAST ) END)>5 as "is_other__1", CASE WHEN group_set=2 THEN base."code" END as "code__2", CASE WHEN group_set=2 THEN base."elevation" END as "elevation__2" FROM '../data/airports.parquet' as base CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,2,1)) as group_set ) as group_set GROUP BY 1,2,5,8,9 ) , __stage1 AS ( SELECT CASE WHEN group_set=2 THEN 1 ELSE group_set END as group_set, "Facility Type__0" as "Facility Type__0", FIRST("airport_count__0") FILTER (WHERE "airport_count__0" IS NOT NULL) as "airport_count__0", FIRST("avg_elevation__0") FILTER (WHERE "avg_elevation__0" IS NOT NULL) as "avg_elevation__0", CASE WHEN group_set IN (1,2) THEN "state__1" END as "state__1", FIRST("avg_elevation__1") FILTER (WHERE "avg_elevation__1" IS NOT NULL) as "avg_elevation__1", FIRST("is_other__1") FILTER (WHERE "is_other__1" IS NOT NULL) as "is_other__1", COALESCE(LIST({ "code": "code__2", "elevation": "elevation__2"} ORDER BY "code__2" asc NULLS LAST) FILTER (WHERE group_set=2),[]) as "data__1" FROM __stage0 GROUP BY 1,2,5 ) SELECT "Facility Type__0" as "Facility Type", MAX(CASE WHEN group_set=0 THEN "airport_count__0" END) as "airport_count", MAX(CASE WHEN group_set=0 THEN "avg_elevation__0" END) as "avg_elevation", (WITH __stage0 AS ( SELECT CASE WHEN COALESCE(NOT base."is_other",TRUE) THEN base."state" ELSE 'OTHER' END as "state", AVG(data_0."elevation") as "avg_elevation", COUNT(DISTINCT base."__distinct_key" || 'x' || data_0_outer.__row_id) as "airport_count", COUNT(DISTINCT CASE WHEN COALESCE(NOT base."is_other",TRUE) THEN base."__distinct_key" || 'x' || data_0_outer.__row_id END) as "sort" FROM (SELECT GEN_RANDOM_UUID() as "__distinct_key", x.* FROM (SELECT UNNEST(COALESCE(LIST({ "state": "state__1", "avg_elevation": "avg_elevation__1", "is_other": "is_other__1", "data": "data__1"} ORDER BY "avg_elevation__1" desc NULLS LAST) FILTER (WHERE group_set=1),[])) as base) as x) as base LEFT JOIN LATERAL (SELECT UNNEST(GENERATE_SERIES(1, length(base."data"),1)) as __row_id, UNNEST(base."data"), 1 as ignoreme) as data_0_outer(__row_id, data_0,ignoreme) ON data_0_outer.ignoreme=1 GROUP BY 1 ORDER BY 4 desc NULLS LAST ) SELECT LIST(STRUCT_PACK("state","avg_elevation","airport_count","sort")) FROM __stage0 ) as "top_states_by_elevation" FROM __stage1 GROUP BY 1 ORDER BY 2 desc NULLS LAST