summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Wilson <chris@chris-wilson.co.uk>2019-05-16 13:18:54 +0100
committerChris Wilson <chris@chris-wilson.co.uk>2019-05-16 14:33:11 +0100
commit2d3e0356b05f313707535802fd2df2e20d2cb8a0 (patch)
tree0fa21ce22c98f00afc225c51f656fbe58e142312
parent384a04d561dd1b9d08278f12c85465533b984da6 (diff)
filter-outliers
-rw-r--r--ministat.c125
1 files changed, 107 insertions, 18 deletions
diff --git a/ministat.c b/ministat.c
index 38ac2ea..e1c5982 100644
--- a/ministat.c
+++ b/ministat.c
@@ -132,8 +132,8 @@ struct dataset {
char *name;
double *points;
unsigned lpoints;
- double sy, syy;
- unsigned n;
+ double sy, syy, median;
+ unsigned n, sn;
};
static struct dataset *
@@ -160,8 +160,6 @@ AddPoint(struct dataset *ds, double a)
free(dp);
}
ds->points[ds->n++] = a;
- ds->sy += a;
- ds->syy += a * a;
}
static double
@@ -182,44 +180,134 @@ static double
Avg(struct dataset *ds)
{
- return(ds->sy / ds->n);
+ return(ds->sy / ds->sn);
}
static double
-Median(struct dataset *ds)
+Median(struct dataset *ds, int low, int high)
{
- int half = ds->n / 2;
+ int mid = (high - low) / 2 + low;
- return (ds->points[half] + ds->points[ds->n - half - 1]) / 2;
+ return (ds->points[mid] + ds->points[ds->n - mid - 1]) / 2;
}
static double
Var(struct dataset *ds)
{
- return (ds->syy - ds->sy * ds->sy / ds->n) / (ds->n - 1.0);
+ return (ds->syy - ds->sy * ds->sy / ds->sn) / (ds->sn - 1.0);
}
static double
Stddev(struct dataset *ds)
{
-
return sqrt(Var(ds));
}
+static int
+tukey(struct dataset *ds, int *low, int *high)
+{
+ int range = *high - *low;
+ int mid = range / 2 + *low;
+ double iqm = (ds->points[mid] + ds->points[ds->n - mid - 1]) / 2;
+ double iqr = ds->points[(3*range+3)/4 + *low] - ds->points[range/4 + *low];
+ double lower = iqm - 1.5 * iqr;
+ double upper = iqm + 1.5 * iqr;
+ int shrunk = 0;
+ int i;
+
+ for (i = *low; i < *high; i++) {
+ double a = ds->points[i];
+
+ if (a < lower)
+ shrunk = 1, *low = i + 1;
+ if (a > upper)
+ shrunk = 1, *high = i;
+ }
+
+ return shrunk;
+}
+
+static double tau(struct dataset *ds)
+{
+ double t;
+
+ if (ds->sn > NSTUDENT + 2)
+ t = student[0][5];
+ else
+ t = student[ds->sn - 2][5];
+ t = t * (ds->sn - 1) / sqrt(ds->sn * (ds->sn - 2 + t * t));
+
+ return t * Stddev(ds);
+}
+
+static int
+thompson(struct dataset *ds, int *low, int *high)
+{
+ double dlow = fabs(ds->points[*low] - Avg(ds));
+ double dhigh = fabs(ds->points[*high - 1] - Avg(ds));
+
+ if (dlow > dhigh) {
+ if (dlow > tau(ds)) {
+ double a = ds->points[*low];
+
+ ds->sy -= a;
+ ds->syy -= a * a;
+ ds->sn--;
+ ++*low;
+ return 1;
+ }
+ } else {
+ if (dhigh > tau(ds)) {
+ double a = ds->points[*high - 1];
+
+ ds->sy -= a;
+ ds->syy -= a * a;
+ ds->sn--;
+ --*high;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static void
+FilterOutliers(struct dataset *ds)
+{
+ int i, low = 0, high = ds->n;
+
+ while (tukey(ds, &low, &high))
+ ;
+
+ for (i = low; i < high; i++) {
+ double a = ds->points[i];
+
+ ds->sn++;
+ ds->sy += a;
+ ds->syy += a * a;
+ }
+
+ while (thompson(ds, &low, &high))
+ ;
+
+ ds->median = Median(ds, low, high);
+}
+
static void
VitalsHead(void)
{
- printf(" N Min Max Median Avg Stddev\n");
+ printf(" N Min Max Median Avg Stddev\n");
}
static void
Vitals(struct dataset *ds, int flag)
{
- printf("%c %3d %13.8g %13.8g %13.8g %13.8g %13.8g", symbol[1 << flag],
- ds->n, Min(ds), Max(ds), Median(ds), Avg(ds), Stddev(ds));
+ printf("%c %3d(%3d) %13.8g %13.8g %13.8g %13.8g %13.8g",
+ symbol[1 << flag], ds->n, ds->sn,
+ Min(ds), Max(ds), ds->median, Avg(ds), Stddev(ds));
printf("\n");
}
@@ -229,15 +317,15 @@ Relative(struct dataset *ds, struct dataset *rs, int confidx)
double spool, s, d, e, t;
int i;
- i = ds->n + rs->n - 2;
+ i = ds->sn + rs->sn - 2;
if (i > NSTUDENT)
t = student[0][confidx];
else
t = student[i][confidx];
- spool = (ds->n - 1) * Var(ds) + (rs->n - 1) * Var(rs);
- spool /= ds->n + rs->n - 2;
+ spool = (ds->sn - 1) * Var(ds) + (rs->sn - 1) * Var(rs);
+ spool /= ds->sn + rs->sn - 2;
spool = sqrt(spool);
- s = spool * sqrt(1.0 / ds->n + 1.0 / rs->n);
+ s = spool * sqrt(1.0 / ds->sn + 1.0 / rs->sn);
d = Avg(ds) - Avg(rs);
e = t * s;
@@ -387,7 +475,7 @@ PlotSet(struct dataset *ds, int val)
if (pl->bar[bar][i] == 0)
pl->bar[bar][i] = '_';
}
- x = floor((Median(ds) - pl->x0) / pl->dx);
+ x = floor((ds->median - pl->x0) / pl->dx);
pl->bar[bar][x] = 'M';
x = floor((Avg(ds) - pl->x0) / pl->dx);
pl->bar[bar][x] = 'A';
@@ -511,6 +599,7 @@ ReadSet(const char *n, int column, const char *delim)
return NULL;
}
qsort(s->points, s->n, sizeof *s->points, dbl_cmp);
+ FilterOutliers(s);
return (s);
}