diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2019-05-16 13:18:54 +0100 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2019-05-16 14:33:11 +0100 |
commit | 2d3e0356b05f313707535802fd2df2e20d2cb8a0 (patch) | |
tree | 0fa21ce22c98f00afc225c51f656fbe58e142312 | |
parent | 384a04d561dd1b9d08278f12c85465533b984da6 (diff) |
filter-outliers
-rw-r--r-- | ministat.c | 125 |
1 files changed, 107 insertions, 18 deletions
@@ -132,8 +132,8 @@ struct dataset { char *name; double *points; unsigned lpoints; - double sy, syy; - unsigned n; + double sy, syy, median; + unsigned n, sn; }; static struct dataset * @@ -160,8 +160,6 @@ AddPoint(struct dataset *ds, double a) free(dp); } ds->points[ds->n++] = a; - ds->sy += a; - ds->syy += a * a; } static double @@ -182,44 +180,134 @@ static double Avg(struct dataset *ds) { - return(ds->sy / ds->n); + return(ds->sy / ds->sn); } static double -Median(struct dataset *ds) +Median(struct dataset *ds, int low, int high) { - int half = ds->n / 2; + int mid = (high - low) / 2 + low; - return (ds->points[half] + ds->points[ds->n - half - 1]) / 2; + return (ds->points[mid] + ds->points[ds->n - mid - 1]) / 2; } static double Var(struct dataset *ds) { - return (ds->syy - ds->sy * ds->sy / ds->n) / (ds->n - 1.0); + return (ds->syy - ds->sy * ds->sy / ds->sn) / (ds->sn - 1.0); } static double Stddev(struct dataset *ds) { - return sqrt(Var(ds)); } +static int +tukey(struct dataset *ds, int *low, int *high) +{ + int range = *high - *low; + int mid = range / 2 + *low; + double iqm = (ds->points[mid] + ds->points[ds->n - mid - 1]) / 2; + double iqr = ds->points[(3*range+3)/4 + *low] - ds->points[range/4 + *low]; + double lower = iqm - 1.5 * iqr; + double upper = iqm + 1.5 * iqr; + int shrunk = 0; + int i; + + for (i = *low; i < *high; i++) { + double a = ds->points[i]; + + if (a < lower) + shrunk = 1, *low = i + 1; + if (a > upper) + shrunk = 1, *high = i; + } + + return shrunk; +} + +static double tau(struct dataset *ds) +{ + double t; + + if (ds->sn > NSTUDENT + 2) + t = student[0][5]; + else + t = student[ds->sn - 2][5]; + t = t * (ds->sn - 1) / sqrt(ds->sn * (ds->sn - 2 + t * t)); + + return t * Stddev(ds); +} + +static int +thompson(struct dataset *ds, int *low, int *high) +{ + double dlow = fabs(ds->points[*low] - Avg(ds)); + double dhigh = fabs(ds->points[*high - 1] - Avg(ds)); + + if (dlow > dhigh) { + if (dlow > tau(ds)) { + double a = ds->points[*low]; + + ds->sy -= a; + ds->syy -= a * a; + ds->sn--; + ++*low; + return 1; + } + } else { + if (dhigh > tau(ds)) { + double a = ds->points[*high - 1]; + + ds->sy -= a; + ds->syy -= a * a; + ds->sn--; + --*high; + return 1; + } + } + + return 0; +} + +static void +FilterOutliers(struct dataset *ds) +{ + int i, low = 0, high = ds->n; + + while (tukey(ds, &low, &high)) + ; + + for (i = low; i < high; i++) { + double a = ds->points[i]; + + ds->sn++; + ds->sy += a; + ds->syy += a * a; + } + + while (thompson(ds, &low, &high)) + ; + + ds->median = Median(ds, low, high); +} + static void VitalsHead(void) { - printf(" N Min Max Median Avg Stddev\n"); + printf(" N Min Max Median Avg Stddev\n"); } static void Vitals(struct dataset *ds, int flag) { - printf("%c %3d %13.8g %13.8g %13.8g %13.8g %13.8g", symbol[1 << flag], - ds->n, Min(ds), Max(ds), Median(ds), Avg(ds), Stddev(ds)); + printf("%c %3d(%3d) %13.8g %13.8g %13.8g %13.8g %13.8g", + symbol[1 << flag], ds->n, ds->sn, + Min(ds), Max(ds), ds->median, Avg(ds), Stddev(ds)); printf("\n"); } @@ -229,15 +317,15 @@ Relative(struct dataset *ds, struct dataset *rs, int confidx) double spool, s, d, e, t; int i; - i = ds->n + rs->n - 2; + i = ds->sn + rs->sn - 2; if (i > NSTUDENT) t = student[0][confidx]; else t = student[i][confidx]; - spool = (ds->n - 1) * Var(ds) + (rs->n - 1) * Var(rs); - spool /= ds->n + rs->n - 2; + spool = (ds->sn - 1) * Var(ds) + (rs->sn - 1) * Var(rs); + spool /= ds->sn + rs->sn - 2; spool = sqrt(spool); - s = spool * sqrt(1.0 / ds->n + 1.0 / rs->n); + s = spool * sqrt(1.0 / ds->sn + 1.0 / rs->sn); d = Avg(ds) - Avg(rs); e = t * s; @@ -387,7 +475,7 @@ PlotSet(struct dataset *ds, int val) if (pl->bar[bar][i] == 0) pl->bar[bar][i] = '_'; } - x = floor((Median(ds) - pl->x0) / pl->dx); + x = floor((ds->median - pl->x0) / pl->dx); pl->bar[bar][x] = 'M'; x = floor((Avg(ds) - pl->x0) / pl->dx); pl->bar[bar][x] = 'A'; @@ -511,6 +599,7 @@ ReadSet(const char *n, int column, const char *delim) return NULL; } qsort(s->points, s->n, sizeof *s->points, dbl_cmp); + FilterOutliers(s); return (s); } |