1313
1414Reduce dataset to selected columns, optionally save it under a different name.
1515"""
function dselect(dInfo::Dinfo, columns::Vector{Int}, tgt::Symbol = dInfo.val)::Dinfo
    # Slice applied to each matrix handed over by `dtransform`: keep every row,
    # but only the requested columns (in the order given in `columns`).
    pick_columns = data -> data[:, columns]

    # `tgt` defaults to `dInfo.val`; per the docstring above, passing another
    # symbol saves the reduced dataset under a different name.
    return dtransform(dInfo, pick_columns, tgt)
end
2319
9187Compute mean and standard deviation of the columns in dataset. Returns a tuple
9288with a vector of means in `columns`, and a vector of corresponding sdevs.
9389"""
94- function dstat (
95- dInfo:: Dinfo ,
96- columns:: Vector{Int} ,
97- ):: Tuple{Vector{Float64},Vector{Float64}}
90+ function dstat (dInfo:: Dinfo , columns:: Vector{Int} ):: Tuple{Vector{Float64},Vector{Float64}}
9891
9992 sum_squares = x -> sum (x .^ 2 )
10093
@@ -136,8 +129,7 @@ function dstat_buckets(
136129 )
137130
138131 # extract the bucketed stats
139- (sums, sqsums, ns) =
140- dmapreduce ([dInfo, buckets], get_bucketed_stats, combine_stats)
132+ (sums, sqsums, ns) = dmapreduce ([dInfo, buckets], get_bucketed_stats, combine_stats)
141133
142134 return (
143135 sums ./ ns, # means
@@ -285,7 +277,8 @@ less or higher than `targets`.
285277"""
286278function update_extrema (counts, targets, lims, mids)
287279 broadcast (
288- (cnt, target, lim, mid) -> cnt >= target ? # if the count is too high,
280+ (cnt, target, lim, mid) ->
281+ cnt >= target ? # if the count is too high,
289282 (lim[1 ], mid) : # median is going to be in the lower half
290283 (mid, lim[2 ]), # otherwise in the higher half
291284 counts,
@@ -313,11 +306,8 @@ function dmedian(dInfo::Dinfo, columns::Vector{Int}; iters = 20)
313306 target = dmapreduce (dInfo, d -> size (d, 1 ), + ) ./ 2
314307
315308 # current estimation range for the median (tuples of min, max)
316- lims = dmapreduce (
317- dInfo,
318- d -> mapslices (extrema, d[:, columns], dims = 1 ),
319- reduce_extrema,
320- )
309+ lims =
310+ dmapreduce (dInfo, d -> mapslices (extrema, d[:, columns], dims = 1 ), reduce_extrema)
321311
322312 # convert the limits to a simple vector
323313 lims = cat (lims... , dims = 1 )
@@ -368,8 +358,8 @@ function dmedian_buckets(
368358 get_bucket_extrema =
369359 (d, b) -> catmapbuckets (
370360 (_, x) -> length (x) > 0 ? # if there are some elements
371- extrema (x) : # just take the extrema
372- (Inf , - Inf ), # if not, use backup values
361+ extrema (x) : # just take the extrema
362+ (Inf , - Inf ), # if not, use backup values
373363 d[:, columns],
374364 nbuckets,
375365 b,
@@ -384,21 +374,22 @@ function dmedian_buckets(
384374 # this counts the elements smaller than mids in buckets
385375 # (both mids and elements are bucketed and column-sliced into matrices)
386376 bucketed_count_smaller_than_mids =
387- (d, b) -> vcat (mapbuckets (
388- (bucketID, d) ->
389- [
390- count (x -> x < mids[bucketID, colID], d[:, colID])
391- for (colID, c) in enumerate (columns)
392- ]' ,
393- d,
394- nbuckets,
395- b,
396- slicedims = (1 , 2 ),
397- )... )
377+ (d, b) -> vcat (
378+ mapbuckets (
379+ (bucketID, d) ->
380+ [
381+ count (x -> x < mids[bucketID, colID], d[:, colID]) for
382+ (colID, c) in enumerate (columns)
383+ ]' ,
384+ d,
385+ nbuckets,
386+ b,
387+ slicedims = (1 , 2 ),
388+ )... ,
389+ )
398390
399391 # gather the counts
400- counts =
401- dmapreduce ([dInfo, buckets], bucketed_count_smaller_than_mids, + )
392+ counts = dmapreduce ([dInfo, buckets], bucketed_count_smaller_than_mids, + )
402393
403394 lims = update_extrema (counts, targets, lims, mids)
404395 end
0 commit comments