Malloy allows you to compute sums and averages correctly based on your join tree. Fan-outs based on join relationships will never impact the correctness of these aggregations. This example has flights
, joining to aircraft
, joining to aircraft_model``.
aircraft_model` has the number of seats specified on this model of aircraft. Code below computes sums and averages at various places in the join tree.
// join 3 tables, flights, aircraft and aircraft models. // `flights` is individual flights // `aircraft` is the plane that made the flight // `aircraft_models` is data about the kind of aircraft source: aircraft_models is duckdb.table('../data/aircraft_models.parquet') extend { primary_key: aircraft_model_code } source: aircraft is duckdb.table('../data/aircraft.parquet') extend { primary_key: tail_num join_one: aircraft_models with aircraft_model_code } source: flights is duckdb.table('../data/flights.parquet') extend { join_one: aircraft with tail_num } run: flights -> { where: dep_time = @2003-01 group_by: carrier aggregate: // number of flights flight_count is count() // number of planes aircraft_count is aircraft.count() // number of different aircraft_models aircraft_model_count is aircraft.aircraft_models.count() // count each seat once for each flight. seats_for_sale is sum(aircraft.aircraft_models.seats) // count the seat once for each plane seats_on_all_planes is aircraft.sum(aircraft.aircraft_models.seats) // average number of seats on each model by model average_seats_per_model is aircraft.aircraft_models.seats.avg() }
[ { "carrier": "WN", "flight_count": 1209, "aircraft_count": 6, "aircraft_model_count": 2, "seats_for_sale": 175362, "seats_on_all_planes": 867, "average_seats_per_model": 144.5 }, { "carrier": "EV", "flight_count": 547, "aircraft_count": 3, "aircraft_model_count": 3, "seats_for_sale": 35896, "seats_on_all_planes": 184, "average_seats_per_model": 61.333333333333336 }, { "carrier": "UA", "flight_count": 524, "aircraft_count": 6, "aircraft_model_count": 5, "seats_for_sale": 107646, "seats_on_all_planes": 1357, "average_seats_per_model": 231.4 }, { "carrier": "AA", "flight_count": 422, "aircraft_count": 6, "aircraft_model_count": 4, "seats_for_sale": 73492, "seats_on_all_planes": 945, "average_seats_per_model": 150.25 }, { "carrier": "NW", "flight_count": 419, "aircraft_count": 5, "aircraft_model_count": 5, "seats_for_sale": 78528, "seats_on_all_planes": 914, "average_seats_per_model": 182.8 } ]
SELECT base."carrier" as "carrier", COUNT(1) as "flight_count", COUNT(DISTINCT aircraft_0."tail_num") as "aircraft_count", COUNT(DISTINCT aircraft_models_0."aircraft_model_code") as "aircraft_model_count", COALESCE(SUM(aircraft_models_0."seats"),0) as "seats_for_sale", COALESCE(( SELECT SUM(a.val) as value FROM ( SELECT UNNEST(list(distinct {key:aircraft_0."tail_num", val: aircraft_models_0."seats"})) a ) ),0) as "seats_on_all_planes", ( SELECT AVG(a.val) as value FROM ( SELECT UNNEST(list(distinct {key:aircraft_models_0."aircraft_model_code", val: aircraft_models_0."seats"})) a ) ) as "average_seats_per_model" FROM '../data/flights.parquet' as base LEFT JOIN '../data/aircraft.parquet' AS aircraft_0 ON aircraft_0."tail_num"=base."tail_num" LEFT JOIN '../data/aircraft_models.parquet' AS aircraft_models_0 ON aircraft_models_0."aircraft_model_code"=aircraft_0."aircraft_model_code" WHERE (base."dep_time">=TIMESTAMP '2003-01-01 00:00:00') and (base."dep_time"<TIMESTAMP '2003-02-01 00:00:00') GROUP BY 1 ORDER BY 2 desc NULLS LAST