在聚合的分組統計中我們會面臨兩種分組元素類型:連續型如時間,自然數等、離散型如地點、產品等。離散型數據本身就代表不同的組別,但連續型數據則需要手工按等長間隔進行切分了。下麵是一個按價錢段聚合的例子: POST /cartxns/_search { "size" : 1, "aggs": { "sal ...
在聚合的分組統計中我們會面臨兩種分組元素類型:連續型如時間,自然數等、離散型如地點、產品等。離散型數據本身就代表不同的組別,但連續型數據則需要手工按等長間隔進行切分了。下麵是一個按價錢段聚合的例子:
POST /cartxns/_search
{
"size" : 1,
"aggs": {
"sales_per_pricerange": {
"histogram": {
"field": "price",
"interval": 20000
},
"aggs": {
"total sales": {
"sum": {
"field": "price"
}
}
}
}
}
}
}
在上面這個例子中我們把價錢按20000進行分段。得出0-19999,20000-39999,40000-59999 ... 價格段的度量:
"aggregations" : {
"sales_per_pricerange" : {
"buckets" : [
{
"key" : 0.0,
"doc_count" : 3,
"total sales" : {
"value" : 37000.0
}
},
{
"key" : 20000.0,
"doc_count" : 4,
"total sales" : {
"value" : 95000.0
}
},
{
"key" : 40000.0,
"doc_count" : 0,
"total sales" : {
"value" : 0.0
}
},
{
"key" : 60000.0,
"doc_count" : 0,
"total sales" : {
"value" : 0.0
}
},
{
"key" : 80000.0,
"doc_count" : 1,
"total sales" : {
"value" : 80000.0
}
}
]
}
}
在elastic4s中是這樣表達的:
val aggHist = search("cartxns").aggregations(
histogramAggregation("sales_per_price")
.field("price")
.interval(20000).subAggregations(
sumAggregation("total_sales").field("price")
)
)
println(aggHist.show)
val histResult = client.execute(aggHist).await
if (histResult.isSuccess)
histResult.result.aggregations.histogram("sales_per_price").buckets
.foreach(hb => println(s"${hb.key},${hb.docCount}:${hb.sum("total_sales").value}"))
else println(s"error: ${histResult.error.reason}")
....
POST:/cartxns/_search?
StringEntity({"aggs":{"sales_per_price":{"histogram":{"interval":20000.0,"field":"price"},"aggs":{"total_sales":{"sum":{"field":"price"}}}}}},Some(application/json))
0.0,3:37000.0
20000.0,4:95000.0
40000.0,0:0.0
60000.0,0:0.0
80000.0,1:80000.0
下麵這個按車款分組統計的就是一個離散元素的聚合統計了:
POST /cartxns/_search
{
"size" : 1,
"aggs": {
"avage price per model" : {
"terms": {"field" : "make.keyword"},
"aggs": {
"average price": {
"avg": {"field": "price"}
},
"max price" : {
"max": {
"field": "price"
}
},
"min price" : {
"min": {
"field": "price"
}
}
}
}
}
}
我們可以得到每一款車的平均售價、最低最高售價:
"aggregations" : {
"avage price per model" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "honda",
"doc_count" : 3,
"max price" : {
"value" : 20000.0
},
"average price" : {
"value" : 16666.666666666668
},
"min price" : {
"value" : 10000.0
}
},
{
"key" : "ford",
"doc_count" : 2,
"max price" : {
"value" : 30000.0
},
"average price" : {
"value" : 27500.0
},
"min price" : {
"value" : 25000.0
}
},
{
"key" : "toyota",
"doc_count" : 2,
"max price" : {
"value" : 15000.0
},
"average price" : {
"value" : 13500.0
},
"min price" : {
"value" : 12000.0
}
},
{
"key" : "bmw",
"doc_count" : 1,
"max price" : {
"value" : 80000.0
},
"average price" : {
"value" : 80000.0
},
"min price" : {
"value" : 80000.0
}
}
]
}
}
elastic4s示範如下:
val aggDisc = search("cartxns").aggregations(
termsAgg("prices_per_model","make.keyword").subAggregations(
avgAgg("average_price","price"),
minAgg("min_price","price"),
maxAgg("max_price","price")
)
)
println(aggDisc.show)
val discResult = client.execute(aggDisc).await
if (discResult.isSuccess)
discResult.result.aggregations.terms("prices_per_model").buckets
.foreach(mb =>
println(s"${mb.key},${mb.docCount}:${mb.avg("average_price").value}," +
s"${mb.min("min_price").value.getOrElse(0)}," +
s"${mb.max("max_price").value.getOrElse(0)}"))
else println(s"error: ${discResult.error.causedBy.getOrElse("unknown")}")
...
POST:/cartxns/_search?
StringEntity({"aggs":{"prices_per_model":{"terms":{"field":"make.keyword"},"aggs":{"average_price":{"avg":{"field":"price"}},"min_price":{"min":{"field":"price"}},"max_price":{"max":{"field":"price"}}}}}},Some(application/json))
honda,3:16666.666666666668,10000.0,20000.0
ford,2:27500.0,25000.0,30000.0
toyota,2:13500.0,12000.0,15000.0
bmw,1:80000.0,80000.0,80000.0
date_histogram是一種按時間間隔聚合的統計方法。對於按時間趨勢變化的數據分析十分有用:
POST /cartxns/_search
{
"aggs": {
"sales_per_month": {
"date_histogram": {
"field": "sold",
"calendar_interval":"1M",
"format": "yyyy-MM-dd"
}
}
}
}
...
"aggregations" : {
"sales_per_month" : {
"buckets" : [
{
"key_as_string" : "2014-01-01",
"key" : 1388534400000,
"doc_count" : 1
},
{
"key_as_string" : "2014-02-01",
"key" : 1391212800000,
"doc_count" : 1
},
{
"key_as_string" : "2014-03-01",
"key" : 1393632000000,
"doc_count" : 0
},
{
"key_as_string" : "2014-04-01",
"key" : 1396310400000,
"doc_count" : 0
},
{
"key_as_string" : "2014-05-01",
"key" : 1398902400000,
"doc_count" : 1
},
{
"key_as_string" : "2014-06-01",
"key" : 1401580800000,
"doc_count" : 0
},
{
"key_as_string" : "2014-07-01",
"key" : 1404172800000,
"doc_count" : 1
},
{
"key_as_string" : "2014-08-01",
"key" : 1406851200000,
"doc_count" : 1
},
{
"key_as_string" : "2014-09-01",
"key" : 1409529600000,
"doc_count" : 0
},
{
"key_as_string" : "2014-10-01",
"key" : 1412121600000,
"doc_count" : 1
},
{
"key_as_string" : "2014-11-01",
"key" : 1414800000000,
"doc_count" : 2
}
]
}
}
上面這個例子產生以月為單元的bucket。elastic4s示範:
val aggDateHist = search("cartxns").aggregations(
dateHistogramAggregation("sales_per_month")
.field("sold")
.calendarInterval(DateHistogramInterval.Month)
.format("yyyy-MM-dd")
.minDocCount(1)
)
println(aggDateHist.show)
val dtHistResult = client.execute(aggDateHist).await
if (dtHistResult.isSuccess)
dtHistResult.result.aggregations.dateHistogram("sales_per_month").buckets
.foreach(db => println(s"${db.date},${db.docCount}"))
else println(s"error: ${dtHistResult.error.causedBy.getOrElse("unknown")}")
...
POST:/cartxns/_search?
StringEntity({"aggs":{"sales_per_month":{"date_histogram":{"calendar_interval":"1M","min_doc_count":1,"format":"yyyy-MM-dd","field":"sold"}}}},Some(application/json))
2014-01-01,1
2014-02-01,1
2014-05-01,1
2014-07-01,1
2014-08-01,1
2014-10-01,1
2014-11-01,2
在以月劃分bucket後可以再進行每個月的深度聚合:
POST /cartxns/_search
{
"aggs": {
"sales_per_month": {
"date_histogram": {
"field": "sold",
"calendar_interval":"1M",
"format": "yyyy-MM-dd"
},
"aggs": {
"per_make_sum": {
"terms": {
"field": "make.keyword",
"size": 10
},
"aggs": {
"sum_price": {
"sum": {"field": "price"}
}
}
},
"total_sum": {
"sum": {
"field": "price"
}
}
}
}
}
}
我們可以得到每個月的銷售總額、每個車款每個月的銷售,如下:
"aggregations" : {
"sales_per_month" : {
"buckets" : [
{
"key_as_string" : "2014-01-01",
"key" : 1388534400000,
"doc_count" : 1,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "bmw",
"doc_count" : 1,
"sum_price" : {
"value" : 80000.0
}
}
]
},
"total_sum" : {
"value" : 80000.0
}
},
{
"key_as_string" : "2014-02-01",
"key" : 1391212800000,
"doc_count" : 1,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "ford",
"doc_count" : 1,
"sum_price" : {
"value" : 25000.0
}
}
]
},
"total_sum" : {
"value" : 25000.0
}
},
{
"key_as_string" : "2014-03-01",
"key" : 1393632000000,
"doc_count" : 0,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
},
"total_sum" : {
"value" : 0.0
}
},
{
"key_as_string" : "2014-04-01",
"key" : 1396310400000,
"doc_count" : 0,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
},
"total_sum" : {
"value" : 0.0
}
},
{
"key_as_string" : "2014-05-01",
"key" : 1398902400000,
"doc_count" : 1,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "ford",
"doc_count" : 1,
"sum_price" : {
"value" : 30000.0
}
}
]
},
"total_sum" : {
"value" : 30000.0
}
},
{
"key_as_string" : "2014-06-01",
"key" : 1401580800000,
"doc_count" : 0,
"per_make_sum" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
},
"total_sum" : {
"value" : 0.0
}
},
{
"key_as_string" : "2014-07-01",
"key" : 1404172800000,
"