Malloy Documentation
search

Malloy separates queries into a source and a view on that source. Views can be defined in a source or used directly in queries. Views that are named as part of a source extension can be reused, nested, and refined.

A view consists of one or more stages separated by ->s. Most views only have one stage, but multi-stage views can be used for more complex analysis.

Reduction vs Projection

Each stage of a view performs a transformation of one of two kinds:

  • Reduction: reduce the grain of the data

  • Projection: select fields without reducing

Reductions use group_by: and/or aggregate:, whereas projections use select:.

Reduction

The following is an example of a reduction:

document
run: flights -> {
  group_by: carrier        
  aggregate: flight_count is count()
}
QUERY RESULTS
[
  {
    "carrier": "WN",
    "flight_count": 88751
  },
  {
    "carrier": "US",
    "flight_count": 37683
  },
  {
    "carrier": "AA",
    "flight_count": 34577
  },
  {
    "carrier": "NW",
    "flight_count": 33580
  },
  {
    "carrier": "UA",
    "flight_count": 32757
  }
]
SELECT 
   base."carrier" as "carrier",
   COUNT(1) as "flight_count"
FROM '../data/flights.parquet' as base
GROUP BY 1
ORDER BY 2 desc NULLS LAST

Projection

The following is an example of a projection:

document
run: flights -> {
  select: *
  limit: 20
}
QUERY RESULTS
[
  {
    "arr_delay": -6,
    "arr_time": "2004-11-18T23:09:00.000Z",
    "cancelled": "N",
    "carrier": "US",
    "dep_delay": -3,
    "dep_time": "2004-11-18T22:32:00.000Z",
    "destination_code": "ABE",
    "distance": 55,
    "diverted": "N",
    "flight_num": "1692",
    "flight_time": 15,
    "id2": 30272525,
    "origin_code": "PHL",
    "tail_num": "N806MD",
    "taxi_in": 4,
    "taxi_out": 18
  },
  {
    "arr_delay": 0,
    "arr_time": "2004-10-12T21:28:00.000Z",
    "cancelled": "N",
    "carrier": "US",
    "dep_delay": 6,
    "dep_time": "2004-10-12T20:46:00.000Z",
    "destination_code": "ABE",
    "distance": 55,
    "diverted": "N",
    "flight_num": "1650",
    "flight_time": 18,
    "id2": 29742442,
    "origin_code": "PHL",
    "tail_num": "N806MD",
    "taxi_in": 4,
    "taxi_out": 20
  },
  {
    "arr_delay": 2,
    "arr_time": "2004-11-24T11:14:00.000Z",
    "cancelled": "N",
    "carrier": "US",
    "dep_delay": 0,
    "dep_time": "2004-11-24T10:20:00.000Z",
    "destination_code": "ABE",
    "distance": 55,
    "diverted": "N",
    "flight_num": "1616",
    "flight_time": 19,
    "id2": 30270885,
    "origin_code": "PHL",
    "tail_num": "N816MA",
    "taxi_in": 5,
    "taxi_out": 30
  },
  {
    "arr_delay": -19,
    "arr_time": "2004-08-31T21:06:00.000Z",
    "cancelled": "N",
    "carrier": "US",
    "dep_delay": 0,
    "dep_time": "2004-08-31T20:30:00.000Z",
    "destination_code": "ABE",
    "distance": 55,
    "diverted": "N",
    "flight_num": "1650",
    "flight_time": 17,
    "id2": 28344746,
    "origin_code": "PHL",
    "tail_num": "N806MD",
    "taxi_in": 4,
    "taxi_out": 15
  },
  {
    "arr_delay": -19,
    "arr_time": "2004-07-27T10:59:00.000Z",
    "cancelled": "N",
    "carrier": "US",
    "dep_delay": -4,
    "dep_time": "2004-07-27T10:21:00.000Z",
    "destination_code": "ABE",
    "distance": 55,
    "diverted": "N",
    "flight_num": "1643",
    "flight_time": 17,
    "id2": 27898410,
    "origin_code": "PHL",
    "tail_num": "N806MD",
    "taxi_in": 4,
    "taxi_out": 17
  }
]
SELECT 
   base."arr_delay" as "arr_delay",
   base."arr_time" as "arr_time",
   base."cancelled" as "cancelled",
   base."carrier" as "carrier",
   base."dep_delay" as "dep_delay",
   base."dep_time" as "dep_time",
   base."destination" as "destination_code",
   base."distance" as "distance",
   base."diverted" as "diverted",
   base."flight_num" as "flight_num",
   base."flight_time" as "flight_time",
   base."id2" as "id2",
   base."origin" as "origin_code",
   base."tail_num" as "tail_num",
   base."taxi_in" as "taxi_in",
   base."taxi_out" as "taxi_out"
FROM '../data/flights.parquet' as base
LIMIT 20

Note that the operations in a stage are for the most part not order-sensitive like SQL; they can be arranged in any order.

View Operations

Views can contain a number of different kinds of operations which affect the behavior of the transformation. The following sections describe these various operations.

Fields

In a stage, fields (dimensions, measures, views, and calculations) may be specified either by referencing an existing name or defining them inline.

document
run: flights -> {
  group_by: carrier
  aggregate: flight_count is count()
}
QUERY RESULTS
[
  {
    "carrier": "WN",
    "flight_count": 88751
  },
  {
    "carrier": "US",
    "flight_count": 37683
  },
  {
    "carrier": "AA",
    "flight_count": 34577
  },
  {
    "carrier": "NW",
    "flight_count": 33580
  },
  {
    "carrier": "UA",
    "flight_count": 32757
  }
]
SELECT 
   base."carrier" as "carrier",
   COUNT(1) as "flight_count"
FROM '../data/flights.parquet' as base
GROUP BY 1
ORDER BY 2 desc NULLS LAST

Dimensions are included with group_by (or select in a projection), measures are included with aggregate:, and views are nested with nest:.

When referencing existing fields in a select: clause, wildcard expressions like *, or some_join.* may be used.

See the Fields section for more information about the different kinds of fields and how they can be defined.

Filters

Filters can be included in a view with where:, which is equivalent to SQL's WHERE clause.

document
run: flights -> {
  where: distance > 1000
  group_by: distance
  aggregate: flight_count
}
QUERY RESULTS
[
  {
    "distance": 1050,
    "flight_count": 1156
  },
  {
    "distance": 2288,
    "flight_count": 1153
  },
  {
    "distance": 1009,
    "flight_count": 1093
  },
  {
    "distance": 1235,
    "flight_count": 1093
  },
  {
    "distance": 1020,
    "flight_count": 1084
  }
]
SELECT 
   base."distance" as "distance",
   COUNT(1) as "flight_count"
FROM '../data/flights.parquet' as base
WHERE base."distance">1000
GROUP BY 1
ORDER BY 2 desc NULLS LAST

Filters may be also be applied to a source, or a measure.

See the Filters section for more information.

See Post-Aggregation Filtering below for information about having:.

Ordering and Limiting

Views may also include ordering and limiting specifications.

document
run: flights -> {
  limit: 4
  group_by: carrier
  aggregate: flight_count
}
QUERY RESULTS
[
  {
    "carrier": "WN",
    "flight_count": 88751
  },
  {
    "carrier": "US",
    "flight_count": 37683
  },
  {
    "carrier": "AA",
    "flight_count": 34577
  },
  {
    "carrier": "NW",
    "flight_count": 33580
  }
]
SELECT 
   base."carrier" as "carrier",
   COUNT(1) as "flight_count"
FROM '../data/flights.parquet' as base
GROUP BY 1
ORDER BY 2 desc NULLS LAST
LIMIT 4

For detailed information on ordering and limiting, see the Ordering and Limiting section.

Post-Aggregation Filtering

Views may filter entire groupings based on aggregate values using the having: clause, which corresponds to SQL's HAVING clause.

document
run: flights -> {
  group_by: carrier
  aggregate: flight_count
  having: flight_count > 35000
}
QUERY RESULTS
[
  {
    "carrier": "WN",
    "flight_count": 88751
  },
  {
    "carrier": "US",
    "flight_count": 37683
  }
]
SELECT 
   base."carrier" as "carrier",
   COUNT(1) as "flight_count"
FROM '../data/flights.parquet' as base
GROUP BY 1
HAVING (COUNT(1))>35000
ORDER BY 2 desc NULLS LAST

Calculations (Window Functions)

Calculations based on other groupings may be performed with the calculate: clause and analytic functions. See the Calculations section for details.

document
run: flights -> {
  group_by: carrier
  aggregate: flight_count
  calculate: flight_count_rank is rank()
}
QUERY RESULTS
[
  {
    "carrier": "WN",
    "flight_count": 88751,
    "flight_count_rank": 1
  },
  {
    "carrier": "US",
    "flight_count": 37683,
    "flight_count_rank": 2
  },
  {
    "carrier": "AA",
    "flight_count": 34577,
    "flight_count_rank": 3
  },
  {
    "carrier": "NW",
    "flight_count": 33580,
    "flight_count_rank": 4
  },
  {
    "carrier": "UA",
    "flight_count": 32757,
    "flight_count_rank": 5
  }
]
SELECT 
   base."carrier" as "carrier",
   COUNT(1) as "flight_count",
   RANK() OVER(  ORDER BY  COUNT(1) desc NULLS LAST ) as "flight_count_rank"
FROM '../data/flights.parquet' as base
GROUP BY 1
ORDER BY 2 desc NULLS LAST

Refinements, Query partials and Shorthand

Query blocks write be written in parts, those parts can be either views in the source and combined with the refinement operator +

The query below

run: flights -> {
  group_by: carrier
  aggregate: flight_count
  limit: 10
}

can be written as below. Each of the parts of a query can be separated into a 'partial query'.

run: flights -> 
  {group_by: carrier}
  + {aggregate: flight_count}
  + {limit: 10}

The query can also be written as:

run: flights -> 
  carrier
  + flight_count
  + {limit: 10}

In the query above, the refienment operator + combines the parameters of a query so the query can be built from parts. Measure, dimension and views declared in the source can also be used. Referencing dimension carrier expands to a partial of {group_by: x}. Referencing measure flight_count expands to a partial query of {aggregate: flight_count}

Source Extensions

When writing a query, if additional source extensions are needed, they can be extend-ed into the source in the query expression, as in

run: some_source extend { extensions } -> { view operations }

Such extensions can also be included in a view using the extend: block.

document
run: flights -> {
  extend: {
    join_one: origin_airport is airports on origin_airport.code = origin_code
    dimension: origin_state is origin_airport.state
  }

  group_by: origin_state
  aggregate: flight_count
  limit: 5
}
QUERY RESULTS
[
  {
    "origin_state": "CA",
    "flight_count": 40670
  },
  {
    "origin_state": "TX",
    "flight_count": 40085
  },
  {
    "origin_state": "FL",
    "flight_count": 24242
  },
  {
    "origin_state": "IL",
    "flight_count": 20850
  },
  {
    "origin_state": "GA",
    "flight_count": 20014
  }
]
SELECT 
   origin_airport_0."state" as "origin_state",
   COUNT(1) as "flight_count"
FROM '../data/flights.parquet' as base
 LEFT JOIN '../data/airports.parquet' AS origin_airport_0
  ON origin_airport_0."code"=base."origin"
GROUP BY 1
ORDER BY 2 desc NULLS LAST
LIMIT 5

When the view is defined inside a source, source extensions can only be added in this way. The legal extensions in a view are:

  • Defining fields with dimension: and measure:

  • Declaring joins with join_one:, join_many:, and join_cross:

Reusing, Nesting, and Refining Views

When views are defined in a source, they can be reused in a number of ways.

Reusing Views

They can serve as the starting point for multiple different queries:

run: flights -> by_carrier -> { select: nickname; limit: 1 }
run: flights -> by_carrier -> { index: * }

Nesting Views

Views can also be nested in other views:

document
run: flights -> {
  group_by: origin.state
  nest: by_carrier
}
QUERY RESULTS
[
  {
    "state": "AK",
    "by_carrier": [
      {
        "nickname": "Alaska",
        "flight_count": 502,
        "destination_count": 5
      },
      {
        "nickname": "Northwest",
        "flight_count": 164,
        "destination_count": 3
      },
      {
        "nickname": "Delta",
        "flight_count": 140,
        "destination_count": 5
      },
      {
        "nickname": "United",
        "flight_count": 55,
        "destination_count": 4
      },
      {
        "nickname": "American",
        "flight_count": 1,
        "destination_count": 1
      }
    ]
  },
  {
    "state": "AL",
    "by_carrier": [
      {
        "nickname": "Atlantic Southeast",
        "flight_count": 910,
        "destination_count": 1
      },
      {
        "nickname": "Southwest",
        "flight_count": 836,
        "destination_count": 12
      },
      {
        "nickname": "Delta",
        "flight_count": 587,
        "destination_count": 4
      },
      {
        "nickname": "Continental Express",
        "flight_count": 232,
        "destination_count": 3
      },
      {
        "nickname": "Comair",
        "flight_count": 103,
        "destination_count": 5
      },
      {
        "nickname": "American",
        "flight_count": 45,
        "destination_count": 1
      },
      {
        "nickname": "USAir",
        "flight_count": 28,
        "destination_count": 1
      },
      {
        "nickname": "Continental",
        "flight_count": 15,
        "destination_count": 1
      }
    ]
  },
  {
    "state": "AR",
    "by_carrier": [
      {
        "nickname": "American Eagle",
        "flight_count": 853,
        "destination_count": 3
      },
      {
        "nickname": "Southwest",
        "flight_count": 364,
        "destination_count": 7
      },
      {
        "nickname": "Continental Express",
        "flight_count": 136,
        "destination_count": 3
      },
      {
        "nickname": "Delta",
        "flight_count": 66,
        "destination_count": 2
      },
      {
        "nickname": "American",
        "flight_count": 65,
        "destination_count": 2
      },
      {
        "nickname": "Atlantic Southeast",
        "flight_count": 44,
        "destination_count": 1
      },
      {
        "nickname": "Comair",
        "flight_count": 40,
        "destination_count": 2
      },
      {
        "nickname": "Northwest",
        "flight_count": 13,
        "destination_count": 1
      }
    ]
  },
  {
    "state": "AZ",
    "by_carrier": [
      {
        "nickname": "Southwest",
        "flight_count": 6912,
        "destination_count": 42
      },
      {
        "nickname": "America West",
        "flight_count": 3819,
        "destination_count": 41
      },
      {
        "nickname": "United",
        "flight_count": 703,
        "destination_count": 7
      },
      {
        "nickname": "American",
        "flight_count": 661,
        "destination_count": 6
      },
      {
        "nickname": "Northwest",
        "flight_count": 543,
        "destination_count": 5
      },
      {
        "nickname": "Alaska",
        "flight_count": 381,
        "destination_count": 3
      },
      {
        "nickname": "USAir",
        "flight_count": 316,
        "destination_count": 3
      },
      {
        "nickname": "Delta",
        "flight_count": 90,
        "destination_count": 5
      },
      {
        "nickname": "ATA",
        "flight_count": 42,
        "destination_count": 2
      },
      {
        "nickname": "Continental",
        "flight_count": 37,
        "destination_count": 3
      },
      {
        "nickname": "Continental Express",
        "flight_count": 27,
        "destination_count": 1
      },
      {
        "nickname": "Jetblue",
        "flight_count": 7,
        "destination_count": 1
      }
    ]
  },
  {
    "state": "CA",
    "by_carrier": [
      {
        "nickname": "Southwest",
        "flight_count": 19755,
        "destination_count": 32
      },
      {
        "nickname": "United",
        "flight_count": 6901,
        "destination_count": 37
      },
      {
        "nickname": "American",
        "flight_count": 4687,
        "destination_count": 31
      },
      {
        "nickname": "Alaska",
        "flight_count": 2618,
        "destination_count": 9
      },
      {
        "nickname": "Northwest",
        "flight_count": 2209,
        "destination_count": 10
      },
      {
        "nickname": "America West",
        "flight_count": 2047,
        "destination_count": 4
      },
      {
        "nickname": "Jetblue",
        "flight_count": 910,
        "destination_count": 9
      },
      {
        "nickname": "Delta",
        "flight_count": 749,
        "destination_count": 13
      },
      {
        "nickname": "USAir",
        "flight_count": 542,
        "destination_count": 4
      },
      {
        "nickname": "Continental",
        "flight_count": 105,
        "destination_count": 2
      },
      {
        "nickname": "ATA",
        "flight_count": 78,
        "destination_count": 4
      },
      {
        "nickname": "American Eagle",
        "flight_count": 42,
        "destination_count": 2
      },
      {
        "nickname": "Continental Express",
        "flight_count": 26,
        "destination_count": 2
      },
      {
        "nickname": "Comair",
        "flight_count": 1,
        "destination_count": 1
      }
    ]
  }
]
WITH __stage0 AS (
  SELECT
    group_set,
    origin_0."state" as "state__0",
    CASE WHEN group_set=1 THEN
      carriers_0."nickname"
      END as "nickname__1",
    CASE WHEN group_set=1 THEN
      COUNT(1)
      END as "flight_count__1",
    CASE WHEN group_set=1 THEN
      COUNT(DISTINCT destination_0."code")
      END as "destination_count__1"
  FROM '../data/flights.parquet' as base
   LEFT JOIN '../data/airports.parquet' AS destination_0
    ON destination_0."code"=base."destination"
   LEFT JOIN '../data/airports.parquet' AS origin_0
    ON origin_0."code"=base."origin"
   LEFT JOIN '../data/carriers.parquet' AS carriers_0
    ON carriers_0."code"=base."carrier"
  CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,1,1)) as group_set  ) as group_set
  GROUP BY 1,2,3
)
SELECT
  "state__0" as "state",
  COALESCE(LIST({
    "nickname": "nickname__1", 
    "flight_count": "flight_count__1", 
    "destination_count": "destination_count__1"}  ORDER BY  "flight_count__1" desc NULLS LAST) FILTER (WHERE group_set=1),[]) as "by_carrier"
FROM __stage0
GROUP BY 1
ORDER BY 1 asc NULLS LAST

Refining Views

A view can be "refined," which means adding clauses such as select:, group_by:, aggregate:, or where:. For example, let's define a source with a view and use that to create a query

document
source: flights4 is duckdb.table('../data/flights.parquet') extend {
  view: top_destinations is {
    group_by: destination
    aggregate: flight_count is count()
  }
}

Running the query gives us flight count by destination:

document
run: flights4 -> top_destinations
QUERY RESULTS
[
  {
    "destination": "ATL",
    "flight_count": 17832
  },
  {
    "destination": "DFW",
    "flight_count": 17776
  },
  {
    "destination": "ORD",
    "flight_count": 14213
  },
  {
    "destination": "PHX",
    "flight_count": 12477
  },
  {
    "destination": "LAS",
    "flight_count": 11092
  }
]
SELECT 
   base."destination" as "destination",
   COUNT(1) as "flight_count"
FROM '../data/flights.parquet' as base
GROUP BY 1
ORDER BY 2 desc NULLS LAST

Now let's refine it by adding + { group_by: origin }. This adds a group_by clause to the original query

document
run: flights4 -> top_destinations + {
  group_by: origin
}
QUERY RESULTS
[
  {
    "destination": "LGA",
    "flight_count": 2143,
    "origin": "DCA"
  },
  {
    "destination": "DCA",
    "flight_count": 2123,
    "origin": "LGA"
  },
  {
    "destination": "BOS",
    "flight_count": 1214,
    "origin": "LGA"
  },
  {
    "destination": "LGA",
    "flight_count": 1201,
    "origin": "BOS"
  },
  {
    "destination": "LAS",
    "flight_count": 1073,
    "origin": "LAX"
  }
]
SELECT 
   base."destination" as "destination",
   COUNT(1) as "flight_count",
   base."origin" as "origin"
FROM '../data/flights.parquet' as base
GROUP BY 1,3
ORDER BY 2 desc NULLS LAST

The query is now calculating flight_count grouped by both destination and origin.

Refinement can be thought of as similar to extending a class in object-oriented programming. The new view inherits the properties of the original, and adds new properties to it. (That said, it should not be confused with the similar concept of source extension.) This makes query logic much more reusable, since views can be easily saved and modified.

Multi-Stage Views

This example shows a view with 3 stages separated by ->. Each stage generates a CTE in the SQL.

document
run: duckdb.table('../data/flights.parquet') -> {
  select: *
  where: dep_time > @2003
} -> {    
  -- extend: allows you to define fields for use within the view
  extend: { measure: flight_count is count() }   
  aggregate: flight_count
  nest: main_view is {
    group_by: carrier
    aggregate: flight_count
  }
} -> {
  select:
    main_view.carrier
    main_view.flight_count
    # percent
    flight_count_as_a_percent_of_total is main_view.flight_count / flight_count
}
QUERY RESULTS
[
  {
    "carrier": "WN",
    "flight_count": 32189,
    "flight_count_as_a_percent_of_total": 0.22961636682692993
  },
  {
    "carrier": "UA",
    "flight_count": 13693,
    "flight_count_as_a_percent_of_total": 0.09767737149216041
  },
  {
    "carrier": "US",
    "flight_count": 12834,
    "flight_count_as_a_percent_of_total": 0.09154979812534775
  },
  {
    "carrier": "NW",
    "flight_count": 12175,
    "flight_count_as_a_percent_of_total": 0.08684890074615154
  },
  {
    "carrier": "RU",
    "flight_count": 12063,
    "flight_count_as_a_percent_of_total": 0.08604996219308633
  }
]
WITH __stage0 AS (
  SELECT 
     base."arr_delay" as "arr_delay",
     base."arr_time" as "arr_time",
     base."cancelled" as "cancelled",
     base."carrier" as "carrier",
     base."dep_delay" as "dep_delay",
     base."dep_time" as "dep_time",
     base."destination" as "destination",
     base."distance" as "distance",
     base."diverted" as "diverted",
     base."flight_num" as "flight_num",
     base."flight_time" as "flight_time",
     base."id2" as "id2",
     base."origin" as "origin",
     base."tail_num" as "tail_num",
     base."taxi_in" as "taxi_in",
     base."taxi_out" as "taxi_out"
  FROM '../data/flights.parquet' as base
  WHERE base."dep_time">=TIMESTAMP '2004-01-01 00:00:00'
)
, __stage1 AS (
  SELECT
    group_set,
    CASE WHEN group_set=0 THEN
      COUNT(1)
      END as "flight_count__0",
    CASE WHEN group_set=1 THEN
      base."carrier"
      END as "carrier__1",
    CASE WHEN group_set=1 THEN
      COUNT(1)
      END as "flight_count__1"
  FROM __stage0 as base
  CROSS JOIN (SELECT UNNEST(GENERATE_SERIES(0,1,1)) as group_set  ) as group_set
  GROUP BY 1,3
)
, __stage2 AS (
  SELECT
    MAX(CASE WHEN group_set=0 THEN "flight_count__0" END) as "flight_count",
    COALESCE(LIST({
      "carrier": "carrier__1", 
      "flight_count": "flight_count__1"}  ORDER BY  "flight_count__1" desc NULLS LAST) FILTER (WHERE group_set=1),[]) as "main_view"
  FROM __stage1
)
SELECT 
   main_view_0."carrier" as "carrier",
   main_view_0."flight_count" as "flight_count",
   main_view_0."flight_count"*1.0/base."flight_count" as "flight_count_as_a_percent_of_total"
FROM __stage2 as base
LEFT JOIN LATERAL (SELECT UNNEST(base."main_view"), 1 as ignoreme) as main_view_0_outer(main_view_0,ignoreme) ON main_view_0_outer.ignoreme=1