By examining the range of values over a dataset, we can compute the appropriate histogram bin size, while capturing the data at the same time. We can then pipe the output to another query to display a histogram.
source: airports is duckdb.table('../data/airports.parquet') extend { measure: airport_count is count() # bar_chart view: by_elevation is { aggregate: bin_size is (max(elevation) - min(elevation)) / 30 nest: data is { group_by: elevation aggregate: row_count is count() } } -> { group_by: elevation is floor(data.elevation / bin_size) * nullif(bin_size, 0) + bin_size / 2 aggregate: airport_count is data.row_count.sum() order_by: elevation } }
Overall elevation distribution
The query can be used to show the overall distribution of the data.
We are showing the bin_size in this exmaple for clarity.
run: airports -> by_elevation
[ { "elevation": -210.86666666666667, "airport_count": 12 }, { "elevation": 210.86666666666667, "airport_count": 6230 }, { "elevation": 632.6, "airport_count": 5015 }, { "elevation": 1054.3333333333335, "airport_count": 3858 }, { "elevation": 1476.0666666666666, "airport_count": 1362 }, { "elevation": 1897.8000000000002, "airport_count": 625 }, { "elevation": 2319.5333333333338, "airport_count": 446 }, { "elevation": 2741.266666666667, "airport_count": 333 }, { "elevation": 3163, "airport_count": 244 }, { "elevation": 3584.7333333333336, "airport_count": 227 }, { "elevation": 4006.466666666667, "airport_count": 225 }, { "elevation": 4428.200000000001, "airport_count": 240 }, { "elevation": 4849.933333333333, "airport_count": 248 }, { "elevation": 5271.666666666667, "airport_count": 178 }, { "elevation": 5693.400000000001, "airport_count": 128 }, { "elevation": 6115.133333333333, "airport_count": 108 }, { "elevation": 6536.866666666667, "airport_count": 96 }, { "elevation": 6958.6, "airport_count": 70 }, { "elevation": 7380.333333333334, "airport_count": 40 }, { "elevation": 7802.0666666666675, "airport_count": 33 }, { "elevation": 8223.8, "airport_count": 23 }, { "elevation": 8645.533333333335, "airport_count": 12 }, { "elevation": 9067.266666666666, "airport_count": 10 }, { "elevation": 9489, "airport_count": 8 }, { "elevation": 9910.733333333334, "airport_count": 11 }, { "elevation": 10332.466666666667, "airport_count": 3 }, { "elevation": 10754.2, "airport_count": 5 }, { "elevation": 11175.933333333334, "airport_count": 2 }, { "elevation": 12441.133333333333, "airport_count": 1 } ]
WITH __stage0 AS ( SELECT group_set, ((CASE WHEN group_set=0 THEN max(base."elevation") END-CASE WHEN group_set=0 THEN min(base."elevation") END))*1.0/30 as "bin_size__0", CASE WHEN group_set=1 THEN base."elevation" END as "elevation__1", CASE WHEN group_set=1 THEN COUNT(1) END as "row_count__1" FROM '../data/airports.parquet' as base CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,1,1)) as group_set ) as group_set GROUP BY 1,3 ) , __stage1 AS ( SELECT MAX(CASE WHEN group_set=0 THEN "bin_size__0" END) as "bin_size", COALESCE(LIST({ "elevation": "elevation__1", "row_count": "row_count__1"} ORDER BY "row_count__1" desc NULLS LAST) FILTER (WHERE group_set=1),[]) as "data" FROM __stage0 ) SELECT ((FLOOR(data_0."elevation"*1.0/base."bin_size"))*(NULLIF(base."bin_size",0)))+(base."bin_size"*1.0/2) as "elevation", COALESCE(SUM(data_0."row_count"),0) as "airport_count" FROM __stage1 as base LEFT JOIN LATERAL (SELECT UNNEST(base."data"), 1 as ignoreme) as data_0_outer(data_0,ignoreme) ON data_0_outer.ignoreme=1 GROUP BY 1 ORDER BY 1 ASC NULLS LAST
Distribution Adapts Automatically
Notice that when we look at Florida, the bin-width is different.
run: airports extend { where: state = 'FL' } -> by_elevation
[ { "elevation": 5.566666666666666, "airport_count": 137 }, { "elevation": 16.7, "airport_count": 134 }, { "elevation": 27.833333333333332, "airport_count": 87 }, { "elevation": 38.96666666666667, "airport_count": 40 }, { "elevation": 50.099999999999994, "airport_count": 55 }, { "elevation": 61.233333333333334, "airport_count": 45 }, { "elevation": 72.36666666666666, "airport_count": 50 }, { "elevation": 83.5, "airport_count": 46 }, { "elevation": 94.63333333333333, "airport_count": 57 }, { "elevation": 105.76666666666665, "airport_count": 31 }, { "elevation": 116.89999999999999, "airport_count": 32 }, { "elevation": 128.03333333333333, "airport_count": 25 }, { "elevation": 139.16666666666666, "airport_count": 16 }, { "elevation": 150.29999999999998, "airport_count": 27 }, { "elevation": 161.43333333333334, "airport_count": 13 }, { "elevation": 172.56666666666666, "airport_count": 6 }, { "elevation": 183.7, "airport_count": 5 }, { "elevation": 194.83333333333331, "airport_count": 14 }, { "elevation": 205.96666666666664, "airport_count": 4 }, { "elevation": 217.1, "airport_count": 6 }, { "elevation": 228.23333333333332, "airport_count": 3 }, { "elevation": 239.36666666666665, "airport_count": 4 }, { "elevation": 250.5, "airport_count": 6 }, { "elevation": 261.6333333333333, "airport_count": 3 }, { "elevation": 272.76666666666665, "airport_count": 2 }, { "elevation": 283.9, "airport_count": 3 }, { "elevation": 295.0333333333333, "airport_count": 3 }, { "elevation": 328.43333333333334, "airport_count": 1 }, { "elevation": 339.56666666666666, "airport_count": 1 } ]
WITH __stage0 AS ( SELECT group_set, ((CASE WHEN group_set=0 THEN max(base."elevation") END-CASE WHEN group_set=0 THEN min(base."elevation") END))*1.0/30 as "bin_size__0", CASE WHEN group_set=1 THEN base."elevation" END as "elevation__1", CASE WHEN group_set=1 THEN COUNT(1) END as "row_count__1" FROM '../data/airports.parquet' as base CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,1,1)) as group_set ) as group_set WHERE (base."state"='FL') AND ((group_set NOT IN (1) OR (group_set IN (1) AND base."state"='FL'))) GROUP BY 1,3 ) , __stage1 AS ( SELECT MAX(CASE WHEN group_set=0 THEN "bin_size__0" END) as "bin_size", COALESCE(LIST({ "elevation": "elevation__1", "row_count": "row_count__1"} ORDER BY "row_count__1" desc NULLS LAST) FILTER (WHERE group_set=1),[]) as "data" FROM __stage0 ) SELECT ((FLOOR(data_0."elevation"*1.0/base."bin_size"))*(NULLIF(base."bin_size",0)))+(base."bin_size"*1.0/2) as "elevation", COALESCE(SUM(data_0."row_count"),0) as "airport_count" FROM __stage1 as base LEFT JOIN LATERAL (SELECT UNNEST(base."data"), 1 as ignoreme) as data_0_outer(data_0,ignoreme) ON data_0_outer.ignoreme=1 GROUP BY 1 ORDER BY 1 ASC NULLS LAST
Elevation within states
This binning even adapts when the queries are nested.
Notice that all the binning is local to the individual states. For example the bottom bin in Colorado starts at an elevation of 3000ish feet.
run: airports -> { group_by: state is state aggregate: airport_count nest: by_elevation }
[ { "state": "TX", "airport_count": 1845, "by_elevation": [ { "elevation": 83.26666666666667, "airport_count": 423 }, { "elevation": 249.8, "airport_count": 127 }, { "elevation": 416.3333333333333, "airport_count": 197 }, { "elevation": 582.8666666666667, "airport_count": 284 }, { "elevation": 749.4, "airport_count": 213 }, { "elevation": 915.9333333333333, "airport_count": 112 }, { "elevation": 1082.4666666666667, "airport_count": 68 }, { "elevation": 1249, "airport_count": 42 }, { "elevation": 1415.5333333333333, "airport_count": 49 }, { "elevation": 1582.0666666666666, "airport_count": 31 }, { "elevation": 1748.6, "airport_count": 42 }, { "elevation": 1915.1333333333332, "airport_count": 24 }, { "elevation": 2081.666666666667, "airport_count": 16 }, { "elevation": 2248.2000000000003, "airport_count": 18 }, { "elevation": 2414.7333333333336, "airport_count": 20 }, { "elevation": 2581.266666666667, "airport_count": 17 }, { "elevation": 2747.8, "airport_count": 16 }, { "elevation": 2914.3333333333335, "airport_count": 10 }, { "elevation": 3080.866666666667, "airport_count": 17 }, { "elevation": 3247.4, "airport_count": 25 }, { "elevation": 3413.9333333333334, "airport_count": 17 }, { "elevation": 3580.4666666666667, "airport_count": 22 }, { "elevation": 3747, "airport_count": 23 }, { "elevation": 3913.5333333333333, "airport_count": 12 }, { "elevation": 4080.066666666667, "airport_count": 4 }, { "elevation": 4246.599999999999, "airport_count": 2 }, { "elevation": 4413.133333333333, "airport_count": 4 }, { "elevation": 4579.666666666666, "airport_count": 6 }, { "elevation": 4746.2, "airport_count": 2 }, { "elevation": 4912.733333333333, "airport_count": 1 }, { "elevation": 5079.266666666666, "airport_count": 1 } ] }, { "state": "CA", "airport_count": 984, "by_elevation": [ { "elevation": -122.3, "airport_count": 10 }, { "elevation": 122.3, "airport_count": 346 }, { "elevation": 366.9, "airport_count": 169 }, { "elevation": 611.5, "airport_count": 73 }, { "elevation": 856.0999999999999, "airport_count": 57 }, { "elevation": 1100.7, "airport_count": 44 }, { "elevation": 1345.3, "airport_count": 37 }, { "elevation": 1589.8999999999999, "airport_count": 26 }, { "elevation": 1834.5, "airport_count": 16 }, { "elevation": 2079.1, "airport_count": 13 }, { "elevation": 2323.7000000000003, "airport_count": 31 }, { "elevation": 2568.3, "airport_count": 32 }, { "elevation": 2812.9, "airport_count": 23 }, { "elevation": 3057.5, "airport_count": 17 }, { "elevation": 3302.1, "airport_count": 15 }, { "elevation": 3546.7000000000003, "airport_count": 7 }, { "elevation": 3791.3, "airport_count": 8 }, { "elevation": 4035.9, "airport_count": 10 }, { "elevation": 4280.5, "airport_count": 8 }, { "elevation": 4525.1, "airport_count": 9 }, { "elevation": 5014.3, "airport_count": 7 }, { "elevation": 5258.9, "airport_count": 4 }, { "elevation": 5503.5, "airport_count": 1 }, { "elevation": 5748.1, "airport_count": 4 }, { "elevation": 5992.7, "airport_count": 1 }, { "elevation": 6237.3, "airport_count": 4 }, { "elevation": 6481.9, "airport_count": 3 }, { "elevation": 6726.5, "airport_count": 6 }, { "elevation": 6971.1, "airport_count": 2 }, { "elevation": 7215.7, "airport_count": 1 } ] }, { "state": "IL", "airport_count": 890, "by_elevation": [ { "elevation": 310, "airport_count": 1 }, { "elevation": 334.8, "airport_count": 5 }, { "elevation": 359.59999999999997, "airport_count": 4 }, { "elevation": 384.4, "airport_count": 5 }, { "elevation": 409.2, "airport_count": 15 }, { "elevation": 434, "airport_count": 26 }, { "elevation": 458.8, "airport_count": 30 }, { "elevation": 483.59999999999997, "airport_count": 14 }, { "elevation": 508.4, "airport_count": 17 }, { "elevation": 533.2, "airport_count": 22 }, { "elevation": 558, "airport_count": 22 }, { "elevation": 582.8, "airport_count": 47 }, { "elevation": 607.6, "airport_count": 52 }, { "elevation": 632.4, "airport_count": 79 }, { "elevation": 657.2, "airport_count": 90 }, { "elevation": 682, "airport_count": 67 }, { "elevation": 706.8, "airport_count": 69 }, { "elevation": 731.6, "airport_count": 71 }, { "elevation": 756.4, "airport_count": 50 }, { "elevation": 781.2, "airport_count": 46 }, { "elevation": 806, "airport_count": 41 }, { "elevation": 830.8, "airport_count": 26 }, { "elevation": 855.6, "airport_count": 27 }, { "elevation": 880.4, "airport_count": 20 }, { "elevation": 905.2, "airport_count": 17 }, { "elevation": 930, "airport_count": 9 }, { "elevation": 954.8, "airport_count": 8 }, { "elevation": 979.6, "airport_count": 7 }, { "elevation": 1004.4, "airport_count": 1 }, { "elevation": 1054.0000000000002, "airport_count": 2 } ] }, { "state": "FL", "airport_count": 856, "by_elevation": [ { "elevation": 5.566666666666666, "airport_count": 137 }, { "elevation": 16.7, "airport_count": 134 }, { "elevation": 27.833333333333332, "airport_count": 87 }, { "elevation": 38.96666666666667, "airport_count": 40 }, { "elevation": 50.099999999999994, "airport_count": 55 }, { "elevation": 61.233333333333334, "airport_count": 45 }, { "elevation": 72.36666666666666, "airport_count": 50 }, { "elevation": 83.5, "airport_count": 46 }, { "elevation": 94.63333333333333, "airport_count": 57 }, { "elevation": 105.76666666666665, "airport_count": 31 }, { "elevation": 116.89999999999999, "airport_count": 32 }, { "elevation": 128.03333333333333, "airport_count": 25 }, { "elevation": 139.16666666666666, "airport_count": 16 }, { "elevation": 150.29999999999998, "airport_count": 27 }, { "elevation": 161.43333333333334, "airport_count": 13 }, { "elevation": 172.56666666666666, "airport_count": 6 }, { "elevation": 183.7, "airport_count": 5 }, { "elevation": 194.83333333333331, "airport_count": 14 }, { "elevation": 205.96666666666664, "airport_count": 4 }, { "elevation": 217.1, "airport_count": 6 }, { "elevation": 228.23333333333332, "airport_count": 3 }, { "elevation": 239.36666666666665, "airport_count": 4 }, { "elevation": 250.5, "airport_count": 6 }, { "elevation": 261.6333333333333, "airport_count": 3 }, { "elevation": 272.76666666666665, "airport_count": 2 }, { "elevation": 283.9, "airport_count": 3 }, { "elevation": 295.0333333333333, "airport_count": 3 }, { "elevation": 328.43333333333334, "airport_count": 1 }, { "elevation": 339.56666666666666, "airport_count": 1 } ] }, { "state": "PA", "airport_count": 804, "by_elevation": [ { "elevation": 75.01666666666667, "airport_count": 28 }, { "elevation": 225.05, "airport_count": 53 }, { "elevation": 375.0833333333333, "airport_count": 106 }, { "elevation": 525.1166666666667, "airport_count": 147 }, { "elevation": 675.15, "airport_count": 67 }, { "elevation": 825.1833333333333, "airport_count": 46 }, { "elevation": 975.2166666666667, "airport_count": 51 }, { "elevation": 1125.25, "airport_count": 85 }, { "elevation": 1275.2833333333333, "airport_count": 80 }, { "elevation": 1425.3166666666666, "airport_count": 45 }, { "elevation": 1575.35, "airport_count": 27 }, { "elevation": 1725.3833333333332, "airport_count": 25 }, { "elevation": 1875.4166666666667, "airport_count": 11 }, { "elevation": 2025.45, "airport_count": 16 }, { "elevation": 2175.4833333333336, "airport_count": 5 }, { "elevation": 2325.516666666667, "airport_count": 8 }, { "elevation": 2475.55, "airport_count": 2 }, { "elevation": 2925.65, "airport_count": 1 }, { "elevation": 4576.016666666666, "airport_count": 1 } ] } ]
WITH __stage0 AS ( SELECT group_set, base."state" as "state__0", CASE WHEN group_set=0 THEN COUNT(1) END as "airport_count__0", ((CASE WHEN group_set=1 THEN max(base."elevation") END-CASE WHEN group_set=1 THEN min(base."elevation") END))*1.0/30 as "bin_size__1", CASE WHEN group_set=2 THEN base."elevation" END as "elevation__2", CASE WHEN group_set=2 THEN COUNT(1) END as "row_count__2" FROM '../data/airports.parquet' as base CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,2,1)) as group_set ) as group_set GROUP BY 1,2,5 ) , __stage1 AS ( SELECT CASE WHEN group_set=2 THEN 1 ELSE group_set END as group_set, "state__0" as "state__0", FIRST("airport_count__0") FILTER (WHERE "airport_count__0" IS NOT NULL) as "airport_count__0", FIRST("bin_size__1") FILTER (WHERE "bin_size__1" IS NOT NULL) as "bin_size__1", COALESCE(LIST({ "elevation": "elevation__2", "row_count": "row_count__2"} ORDER BY "row_count__2" desc NULLS LAST) FILTER (WHERE group_set=2),[]) as "data__1" FROM __stage0 GROUP BY 1,2 ) SELECT "state__0" as "state", MAX(CASE WHEN group_set=0 THEN "airport_count__0" END) as "airport_count", (WITH __stage0 AS ( SELECT ((FLOOR(data_0."elevation"*1.0/base."bin_size"))*(NULLIF(base."bin_size",0)))+(base."bin_size"*1.0/2) as "elevation", COALESCE(SUM(data_0."row_count"),0) as "airport_count" FROM (SELECT UNNEST([COALESCE(FIRST({"bin_size": "bin_size__1" , "data": "data__1" }) FILTER(WHERE group_set=1), {"bin_size": NULL, "data": NULL})]) as base) as base LEFT JOIN LATERAL (SELECT UNNEST(base."data"), 1 as ignoreme) as data_0_outer(data_0,ignoreme) ON data_0_outer.ignoreme=1 GROUP BY 1 ORDER BY 1 ASC NULLS LAST ) SELECT LIST(STRUCT_PACK("elevation","airport_count") ORDER BY elevation asc) FROM __stage0 ) as "by_elevation" FROM __stage1 GROUP BY 1 ORDER BY 2 desc NULLS LAST