MongoDB  2.7.0
descriptive_stats-inl.h
1 /* Copyright 2012 10gen Inc.
2  *
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  *
15  *
16  * Based upon boost.accumulators (www.boost.org/libs/accumulators/),
17  * distributed under the Boost Software License, Version 1.0.
18  * See distrc/THIRD_PARTY_NOTICES for the full License Notice for Boost.
19  *
20  */
21 
22 #pragma once
23 
24 #include <algorithm>
25 #include <limits>
26 
27 #include "mongo/util/mongoutils/str.h"
28 
29 namespace mongo {
30 
31  template <class Sample>
32  BasicEstimators<Sample>::BasicEstimators() :
33  _count(0),
34  _sum(0),
35  _diff(0),
36  _min(std::numeric_limits<Sample>::max()),
37  _max(std::numeric_limits<Sample>::min()) {
38 
39  }
40 
41  template <class Sample>
43  const double oldMean = (_count > 0) ? _sum / _count : 0;
44  const double delta = oldMean - static_cast<double>(sample);
45  const double weight = static_cast<double>(_count) / (_count + 1);
46  _diff += delta * delta * weight;
47  _sum += static_cast<double>(sample);
48  _count++;
49  _min = std::min(sample, _min);
50  _max = std::max(sample, _max);
51  return *this;
52  }
53 
54  template <class Sample>
56  b << "count" << static_cast<long long>(count())
57  << "mean" << mean()
58  << "stddev" << stddev()
59  << "min" << min()
60  << "max" << max();
61  }
62 
63  template <std::size_t NumQuantiles>
65  _count(0) {
66 
67  for(std::size_t i = 0; i < NumMarkers; i++) {
68  _actual_positions[i] = i + 1;
69  }
70 
71  for(std::size_t i = 0; i < NumMarkers; i++) {
72  _desired_positions[i] = 1.0 + (2.0 * (NumQuantiles + 1.0) * _positions_increments(i));
73  }
74  }
75 
76  /*
77  * The quantile estimation follows the extended_p_square implementation in boost.accumulators.
78  * It differs by removing the ability to request arbitrary quantiles and computing exactly
79  * 'NumQuantiles' equidistant quantiles (plus minimum and maximum) instead.
80  * See http://www.boost.org/doc/libs/1_51_0/doc/html/boost/accumulators/impl/extended_p_square_impl.html ,
81  * R. Jain and I. Chlamtac, The P^2 algorithmus for dynamic calculation of quantiles and histograms without storing observations, Communications of the ACM, Volume 28 (October), Number 10, 1985, p. 1076-1085. and
82  * K. E. E. Raatikainen, Simultaneous estimation of several quantiles, Simulation, Volume 49, Number 4 (October), 1986, p. 159-164.
83  */
84  template <std::size_t NumQuantiles>
85  DistributionEstimators<NumQuantiles>&
86  DistributionEstimators<NumQuantiles>::operator <<(const double sample) {
87 
88  // first accumulate num_markers samples
89  if (_count++ < NumMarkers) {
90  _heights[_count - 1] = sample;
91 
92  if (_count == NumMarkers)
93  {
94  std::sort(_heights, _heights + NumMarkers);
95  }
96  }
97  else {
98  std::size_t sample_cell = 1;
99 
100  // find cell k = sample_cell such that heights[k-1] <= sample < heights[k]
101  if(sample < _heights[0])
102  {
103  _heights[0] = sample;
104  sample_cell = 1;
105  }
106  else if (sample >= _heights[NumMarkers - 1])
107  {
108  _heights[NumMarkers - 1] = sample;
109  sample_cell = NumMarkers - 1;
110  }
111  else {
112  double* it = std::upper_bound(_heights,
113  _heights + NumMarkers,
114  sample);
115 
116  sample_cell = std::distance(_heights, it);
117  }
118 
119  // update actual positions of all markers above sample_cell index
120  for(std::size_t i = sample_cell; i < NumMarkers; i++) {
121  _actual_positions[i]++;
122  }
123 
124  // update desired positions of all markers
125  for(std::size_t i = 0; i < NumMarkers; i++) {
126  _desired_positions[i] += _positions_increments(i);
127  }
128 
129  // adjust heights and actual positions of markers 1 to num_markers-2 if necessary
130  for(std::size_t i = 1; i <= NumMarkers - 2; i++) {
131  // offset to desired position
132  double d = _desired_positions[i] - _actual_positions[i];
133 
134  // offset to next position
135  double dp = _actual_positions[i + 1] - _actual_positions[i];
136 
137  // offset to previous position
138  double dm = _actual_positions[i - 1] - _actual_positions[i];
139 
140  // height ds
141  double hp = (_heights[i + 1] - _heights[i]) / dp;
142  double hm = (_heights[i - 1] - _heights[i]) / dm;
143 
144  if((d >= 1 && dp > 1) || (d <= -1 && dm < -1))
145  {
146  short sign_d = static_cast<short>(d / std::abs(d));
147 
148  double h = _heights[i] + sign_d / (dp - dm) * ((sign_d - dm)*hp
149  + (dp - sign_d) * hm);
150 
151  // try adjusting heights[i] using p-squared formula
152  if(_heights[i - 1] < h && h < _heights[i + 1])
153  {
154  _heights[i] = h;
155  }
156  else
157  {
158  // use linear formula
159  if(d > 0)
160  {
161  _heights[i] += hp;
162  }
163  if(d < 0)
164  {
165  _heights[i] -= hm;
166  }
167  }
168  _actual_positions[i] += sign_d;
169  }
170  }
171  }
172 
173  return *this;
174  }
175 
176  template <std::size_t NumQuantiles>
178  BSONArrayBuilder& arr) const {
179 
180  verify(quantilesReady());
181  for (std::size_t i = 0; i <= NumQuantiles + 1; i++) {
182  arr << quantile(i);
183  }
184  }
185 
186  template <std::size_t NumQuantiles>
187  inline double DistributionEstimators<NumQuantiles>::_positions_increments(std::size_t i) const {
188  return static_cast<double>(i) / (2 * (NumQuantiles + 1));
189  }
190 
191  template <class Sample, std::size_t NumQuantiles>
193  BSONObjBuilder b;
196  // Not using appendQuantiles to be explicit about which probability each quantile
197  // refers to. This way the user does not need to count the quantiles or know in
198  // advance how many quantiles were computed to figure out their meaning.
199  BSONObjBuilder quantilesBuilder(b.subobjStart("quantiles"));
200  for (size_t i = 1; i <= NumQuantiles; i++) {
201  const double probability =
203  const double quantile =
205  quantilesBuilder.append(std::string(mongoutils::str::stream() << probability),
206  quantile);
207  }
208  quantilesBuilder.doneFast();
209  }
210  return b.obj();
211  }
212 
213 } // namespace mongo
Utility for creating a BSONObj.
Definition: bsonobjbuilder.h:52
Collects count, minimum and maximum, calculates mean and standard deviation.
Definition: descriptive_stats.h:49
the idea here is to make one liners easy.
Definition: str.h:48
BSONObj obj()
destructive The returned BSONObj will free the buffer when it is finished.
Definition: bsonobjbuilder.h:573
BufBuilder & subobjStart(const StringData &fieldName)
add header for a new subobject and return bufbuilder for writing to the subobject's body ...
Definition: bsonobjbuilder.h:134
void appendQuantilesToBSONArrayBuilder(BSONArrayBuilder &arr) const
Appends the quantiles to the provided BSONArrayBuilder.
Definition: descriptive_stats-inl.h:177
BSONObj statisticSummaryToBSONObj() const
Definition: descriptive_stats-inl.h:192
BasicEstimators & operator<<(const Sample sample)
Update estimators with another observed value.
Definition: descriptive_stats-inl.h:42
double quantile(std::size_t i) const
Updates the estimators with another observed value.
Definition: descriptive_stats.h:125
double probability(std::size_t i) const
Definition: descriptive_stats.h:166
Computes 'NumQuantiles' quantiles.
Definition: descriptive_stats.h:111
C++ representation of a "BSON" object – that is, an extended JSON-style object in a binary representa...
Definition: bsonobj.h:77
Definition: bsonobjbuilder.h:705
void appendBasicToBSONObjBuilder(BSONObjBuilder &b) const
Appends the basic estimators to the provided BSONObjBuilder.
Definition: descriptive_stats-inl.h:55