From e38cabac80bb16620a871e80e7408d03ba89b8f0 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Thu, 15 Feb 2018 15:09:31 +0900 Subject: [PATCH 1/6] Add Enumerable#sample(n=1, ...) - Only support the case of n == 1 --- .../statistics/extension/statistics.c | 119 ++++++++++++++++++ spec/enum/sample_spec.rb | 94 ++++++++++++++ 2 files changed, 213 insertions(+) create mode 100644 spec/enum/sample_spec.rb diff --git a/ext/enumerable/statistics/extension/statistics.c b/ext/enumerable/statistics/extension/statistics.c index fb5abd2..100ec17 100644 --- a/ext/enumerable/statistics/extension/statistics.c +++ b/ext/enumerable/statistics/extension/statistics.c @@ -1413,6 +1413,124 @@ enum_stdev(int argc, VALUE* argv, VALUE obj) return stdev; } +#if SIZEOF_SIZE_T == SIZEOF_LONG +static inline size_t +random_usize_limited(VALUE rnd, size_t max) +{ + return (size_t)rb_random_ulong_limited(rnd, max); +} +#else +static inline size_t +random_usize_limited(VALUE rnd, size_t max) +{ + if (max <= ULONG_MAX) { + return (size_t)rb_random_ulong_limited(rnd, (unsigned long)max); + } + else { + VALUE num = rb_random_int(rnd, SIZET2NUM(max)); + return NUM2SIZET(num); + } +} +#endif + +struct sample_single_memo { + size_t k; + VALUE sample; + VALUE random; +}; + +static VALUE +enum_sample_single_i(RB_BLOCK_CALL_FUNC_ARGLIST(e, data)) +{ + struct sample_single_memo *memo = (struct sample_single_memo *)data; + ENUM_WANT_SVALUE(); + + if (++memo->k <= 1) { + memo->sample = e; + } + else { + size_t j = random_usize_limited(memo->random, memo->k - 1); + if (j == 1) { + memo->sample = e; + } + } + + return Qnil; +} + +static VALUE +enum_sample_single(VALUE obj, VALUE random) +{ + struct sample_single_memo memo; + + memo.k = 0; + memo.sample = Qundef; + memo.random = random; + + rb_block_call(obj, id_each, 0, 0, enum_sample_single_i, (VALUE)&memo); + + return memo.sample; +} + +static VALUE +enum_sample_multiple_unweighted(VALUE obj, long size, int replace_p) +{ + assert(size > 1); + + return Qnil; +} + +/* call-seq: + * enum.sample(n=1, random: Random, replace: false) + */ +static VALUE +enum_sample(int argc, VALUE *argv, VALUE obj) +{ + VALUE size_v, random_v, replace_v, weights_v, opts; + long size; + int replace_p; + + random_v = rb_cRandom; + replace_v = Qundef; + weights_v = Qundef; + + if (argc == 0) goto single; + + rb_scan_args(argc, argv, "01:", &size_v, &opts); + size = NIL_P(size_v) ? 1 : NUM2LONG(size_v); + + if (size == 1 && NIL_P(opts)) { + goto single; + } + + if (!NIL_P(opts)) { + static ID keywords[3]; + VALUE kwargs[3]; + if (!keywords[0]) { + keywords[0] = rb_intern("random"); + keywords[1] = rb_intern("replace"); + /* keywords[2] = rb_intern("weights"); */ + } + rb_get_kwargs(opts, keywords, 0, 2, kwargs); + random_v = kwargs[0]; + replace_v = kwargs[1]; + /* weights_v = kwargs[2]; */ + } + if (random_v == Qundef) { + random_v = rb_cRandom; + } + + if (size == 1) { +single: + return enum_sample_single(obj, random_v); + } + + replace_p = (replace_v == Qundef) ? 1 : RTEST(replace_v); + + return enum_sample_unweighted(obj, NUM2LONG(size), replace_p); +} + + /* call-seq: * ary.mean_stdev(population: false) * @@ -1479,6 +1597,7 @@ Init_extension(void) rb_define_method(rb_mEnumerable, "variance", enum_variance, -1); rb_define_method(rb_mEnumerable, "mean_stdev", enum_mean_stdev, -1); rb_define_method(rb_mEnumerable, "stdev", enum_stdev, -1); + rb_define_method(rb_mEnumerable, "sample", enum_sample, -1); #ifndef HAVE_ARRAY_SUM rb_define_method(rb_cArray, "sum", ary_sum, -1); diff --git a/spec/enum/sample_spec.rb b/spec/enum/sample_spec.rb new file mode 100644 index 0000000..35c25bb --- /dev/null +++ b/spec/enum/sample_spec.rb @@ -0,0 +1,94 @@ +require 'spec_helper' +require 'enumerable/statistics' + +RSpec.describe Enumerable, '#sample' do + let(:random) { Random.new } + let(:n) { 20 } + + context 'without weight' do + let(:enum) { 1.upto(100000) } + + context 'without size' do + context 'without rng' do + context 'without weight' do + specify do + result = enum.sample + expect(result).to be_an(Integer) + other_results = Array.new(100) { enum.sample } + expect(other_results).not_to be_all(eq result) + end + end + end + + context 'with rng' do + specify do + save_random = random.dup + result = enum.sample(random: random) + expect(result).to be_an(Integer) + other_results = Array.new(100) { enum.sample(random: save_random.dup) } + expect(other_results).to be_all(eq result) + end + end + end + + context 'with size (== 1)' do + context 'without rng' do + context 'without weight' do + specify do + result = enum.sample(1) + expect(result).to be_an(Integer) + other_results = Array.new(100) { enum.sample(1) } + expect(other_results).not_to be_all(eq result) + end + end + end + + context 'with rng' do + specify do + save_random = random.dup + result = enum.sample(1, random: random) + expect(result).to be_an(Integer) + other_results = Array.new(100) { enum.sample(1, random: save_random.dup) } + expect(other_results).to be_all(eq result) + end + end + end + + context 'with size (> 1)' do + context 'without replacement' do + context 'without rng' do + subject(:result) { enum.sample(n) } + + specify do + result = enum.sample(n) + expect(result).to be_an(Array) + expect(result.length).to eq(n) + other_results = Array.new(100) { enum.sample(n) } + expect(other_results).not_to be_all(eq result) + end + end + + context 'with rng' do + subject(:result) { enum.sample(n, random: random) } + + specify do + save_random = random.dup + result = enum.sample(n, random: random) + expect(result).to be_an(Array) + expect(result.length).to eq(n) + other_results = Array.new(100) { enum.sample(n, random: random) } + expect(other_results).to be_all(eq result) + end + end + end + + context 'with replacement' do + pending + end + end + end + + context 'with weight' do + pending + end +end From 317886e23ea46aec81ee2d5f4d343d9e70d4d099 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Thu, 15 Feb 2018 15:43:18 +0900 Subject: [PATCH 2/6] Support n > 1 in Enumerable#sample --- .../statistics/extension/statistics.c | 50 ++++++++++++++++--- spec/enum/sample_spec.rb | 16 +++--- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/ext/enumerable/statistics/extension/statistics.c b/ext/enumerable/statistics/extension/statistics.c index 100ec17..849cef3 100644 --- a/ext/enumerable/statistics/extension/statistics.c +++ b/ext/enumerable/statistics/extension/statistics.c @@ -1433,8 +1433,9 @@ random_usize_limited(VALUE rnd, size_t max) } #endif -struct sample_single_memo { +struct enum_sample_memo { size_t k; + long n; VALUE sample; VALUE random; }; @@ -1442,7 +1443,7 @@ struct sample_single_memo { static VALUE enum_sample_single_i(RB_BLOCK_CALL_FUNC_ARGLIST(e, data)) { - struct sample_single_memo *memo = (struct sample_single_memo *)data; + struct enum_sample_memo *memo = (struct enum_sample_memo *)data; ENUM_WANT_SVALUE(); if (++memo->k <= 1) { @@ -1461,9 +1462,10 @@ enum_sample_single_i(RB_BLOCK_CALL_FUNC_ARGLIST(e, data)) static VALUE enum_sample_single(VALUE obj, VALUE random) { - struct sample_single_memo memo; + struct enum_sample_memo memo; memo.k = 0; + memo.n = 1; memo.sample = Qundef; memo.random = random; @@ -1473,13 +1475,46 @@ enum_sample_single(VALUE obj, VALUE random) } static VALUE -enum_sample_multiple_unweighted(VALUE obj, long size, int replace_p) +enum_sample_multiple_without_replace_unweighted_i(RB_BLOCK_CALL_FUNC_ARGLIST(e, data)) { - assert(size > 1); + struct enum_sample_memo *memo = (struct enum_sample_memo *)data; + ENUM_WANT_SVALUE(); + + if (++memo->k <= memo->n) { + rb_ary_push(memo->sample, e); + } + else { + size_t j = random_usize_limited(memo->random, memo->k - 1); + if (j <= memo->n) { + rb_ary_store(memo->sample, (long)(j - 1), e); + } + } return Qnil; } +static VALUE +enum_sample_multiple_unweighted(VALUE obj, long size, VALUE random, int replace_p) +{ + struct enum_sample_memo memo; + + assert(size > 1); + + memo.k = 0; + memo.n = size; + memo.sample = rb_ary_new_capa(size); + memo.random = random; + + if (replace_p) { + return Qnil; + } + else { + rb_block_call(obj, id_each, 0, 0, enum_sample_multiple_without_replace_unweighted_i, (VALUE)&memo); + } + + return memo.sample; +} + /* call-seq: * enum.sample(n=1, random: Random, replace: false) */ @@ -1516,6 +1551,7 @@ enum_sample(int argc, VALUE *argv, VALUE obj) replace_v = kwargs[1]; /* weights_v = kwargs[2]; */ } + if (random_v == Qundef) { random_v = rb_cRandom; } @@ -1525,9 +1561,9 @@ enum_sample(int argc, VALUE *argv, VALUE obj) return enum_sample_single(obj, random_v); } - replace_p = (replace_v == Qundef) ? 1 : RTEST(replace_v); + replace_p = (replace_v == Qundef) ? 0 : RTEST(replace_v); - return enum_sample_unweighted(obj, NUM2LONG(size), replace_p); + return enum_sample_multiple_unweighted(obj, size, random_v, replace_p); } diff --git a/spec/enum/sample_spec.rb b/spec/enum/sample_spec.rb index 35c25bb..f2cd119 100644 --- a/spec/enum/sample_spec.rb +++ b/spec/enum/sample_spec.rb @@ -15,7 +15,7 @@ result = enum.sample expect(result).to be_an(Integer) other_results = Array.new(100) { enum.sample } - expect(other_results).not_to be_all(eq result) + expect(other_results).not_to be_all {|i| i == result } end end end @@ -26,7 +26,7 @@ result = enum.sample(random: random) expect(result).to be_an(Integer) other_results = Array.new(100) { enum.sample(random: save_random.dup) } - expect(other_results).to be_all(eq result) + expect(other_results).to be_all {|i| i == result } end end end @@ -38,7 +38,7 @@ result = enum.sample(1) expect(result).to be_an(Integer) other_results = Array.new(100) { enum.sample(1) } - expect(other_results).not_to be_all(eq result) + expect(other_results).not_to be_all {|i| i == result } end end end @@ -49,7 +49,7 @@ result = enum.sample(1, random: random) expect(result).to be_an(Integer) other_results = Array.new(100) { enum.sample(1, random: save_random.dup) } - expect(other_results).to be_all(eq result) + expect(other_results).to be_all {|i| i == result } end end end @@ -63,8 +63,9 @@ result = enum.sample(n) expect(result).to be_an(Array) expect(result.length).to eq(n) + expect(result.uniq.length).to eq(n) other_results = Array.new(100) { enum.sample(n) } - expect(other_results).not_to be_all(eq result) + expect(other_results).not_to be_all {|i| i == result } end end @@ -76,8 +77,9 @@ result = enum.sample(n, random: random) expect(result).to be_an(Array) expect(result.length).to eq(n) - other_results = Array.new(100) { enum.sample(n, random: random) } - expect(other_results).to be_all(eq result) + expect(result.uniq.length).to eq(n) + other_results = Array.new(100) { enum.sample(n, random: save_random.dup) } + expect(other_results).to be_all {|i| i == result } end end end From 84e77dd1b10c32004f6ecde57415733777065e55 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 16 Feb 2018 10:08:02 +0900 Subject: [PATCH 3/6] Add examples --- spec/enum/sample_spec.rb | 105 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 2 deletions(-) diff --git a/spec/enum/sample_spec.rb b/spec/enum/sample_spec.rb index f2cd119..1f8bf6e 100644 --- a/spec/enum/sample_spec.rb +++ b/spec/enum/sample_spec.rb @@ -5,6 +5,107 @@ let(:random) { Random.new } let(:n) { 20 } + let(:replace) { nil } + let(:weights) { nil } + let(:opts) { {} } + + before do + opts[:replace] = replace if replace + opts[:weights] = weights if weights + end + + context 'when the receiver has 1 item' do + let(:enum) { 1.upto(1) } + + shared_examples_for '1-item enumerable' do + context 'without replacement' do + specify { expect(opts).not_to include(:replace) } + + specify do + expect(enum.sample(**opts)).to eq(1) + + expect(enum.sample(10, **opts)).to eq([1]) + expect(enum.sample(20, **opts)).to eq([1]) + end + end + + context 'with replacement' do + let(:replace) { true } + + specify { expect(opts).to include(replace: true) } + + specify do + expect(enum.sample(10, **opts)).to eq(Array.new(10, 1)) + expect(enum.sample(20, **opts)).to eq(Array.new(20, 1)) + end + end + end + + context 'without weights' do + specify { expect(opts).not_to include(:weights) } + + include_examples '1-item enumerable' + end + + # TODO: weights + xcontext 'with weights' do + let(:weights) do + { 1 => 1.0 } + end + + specify { expect(opts).to include(weights: weights) } + + include_examples '1-item enumerable' + end + end + + context 'when the receiver has 2 item' do + let(:enum) { 1.upto(2) } + + shared_examples_for 'sample from 2-item enumerable without replacement' do + specify { expect(opts).not_to include(:replace) } + + specify do + expect(Array.new(100) { enum.sample(**opts) }).to all(eq(1).or eq(2)) + + expect(enum.sample(10, **opts)).to contain_exactly(1, 2) + expect(enum.sample(20, **opts)).to contain_exactly(1, 2) + end + end + + context 'without weights' do + context 'without replacement' do + it_behaves_like 'sample from 2-item enumerable without replacement' + end + + context 'with replacement' do + let(:replace) { true } + + specify { expect(opts).to include(replace: true) } + + specify do + expect(enum.sample(10, **opts)).to have_attributes(length: 10).and all(eq(1).or eq(2)) + expect(enum.sample(20, **opts)).to have_attributes(length: 20).and all(eq(1).or eq(2)) + end + end + end + + # TODO: weights + xcontext 'with weights' do + specify { expect(opts).to include(weights: weights) } + + context 'without replacement' do + it_behaves_like 'sample from 2-item enumerable without replacement' + end + + context 'with replacement' do + let(:replace) { true } + + specify { expect(opts).to include(replace: true) } + end + end + end + context 'without weight' do let(:enum) { 1.upto(100000) } @@ -65,7 +166,7 @@ expect(result.length).to eq(n) expect(result.uniq.length).to eq(n) other_results = Array.new(100) { enum.sample(n) } - expect(other_results).not_to be_all {|i| i == result } + expect(other_results).not_to be_all {|i| i == result } end end @@ -79,7 +180,7 @@ expect(result.length).to eq(n) expect(result.uniq.length).to eq(n) other_results = Array.new(100) { enum.sample(n, random: save_random.dup) } - expect(other_results).to be_all {|i| i == result } + expect(other_results).to be_all {|i| i == result } end end end From 1a5a7105cf4960e69788da294e7ca404704e3bf5 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 16 Feb 2018 10:14:55 +0900 Subject: [PATCH 4/6] Reorganize example groups --- spec/enum/sample_spec.rb | 64 +++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/spec/enum/sample_spec.rb b/spec/enum/sample_spec.rb index 1f8bf6e..3959e1a 100644 --- a/spec/enum/sample_spec.rb +++ b/spec/enum/sample_spec.rb @@ -109,9 +109,13 @@ context 'without weight' do let(:enum) { 1.upto(100000) } - context 'without size' do - context 'without rng' do - context 'without weight' do + specify { expect(opts).not_to include(:weights) } + + context 'without replacement' do + specify { expect(opts).not_to include(:replace) } + + context 'without size' do + context 'without rng' do specify do result = enum.sample expect(result).to be_an(Integer) @@ -119,22 +123,20 @@ expect(other_results).not_to be_all {|i| i == result } end end - end - context 'with rng' do - specify do - save_random = random.dup - result = enum.sample(random: random) - expect(result).to be_an(Integer) - other_results = Array.new(100) { enum.sample(random: save_random.dup) } - expect(other_results).to be_all {|i| i == result } - end + context 'with rng' do + specify do + save_random = random.dup + result = enum.sample(random: random) + expect(result).to be_an(Integer) + other_results = Array.new(100) { enum.sample(random: save_random.dup) } + expect(other_results).to be_all {|i| i == result } + end + end end - end - context 'with size (== 1)' do - context 'without rng' do - context 'without weight' do + context 'with size (== 1)' do + context 'without rng' do specify do result = enum.sample(1) expect(result).to be_an(Integer) @@ -142,21 +144,19 @@ expect(other_results).not_to be_all {|i| i == result } end end - end - context 'with rng' do - specify do - save_random = random.dup - result = enum.sample(1, random: random) - expect(result).to be_an(Integer) - other_results = Array.new(100) { enum.sample(1, random: save_random.dup) } - expect(other_results).to be_all {|i| i == result } + context 'with rng' do + specify do + save_random = random.dup + result = enum.sample(1, random: random) + expect(result).to be_an(Integer) + other_results = Array.new(100) { enum.sample(1, random: save_random.dup) } + expect(other_results).to be_all {|i| i == result } + end end end - end - context 'with size (> 1)' do - context 'without replacement' do + context 'with size (> 1)' do context 'without rng' do subject(:result) { enum.sample(n) } @@ -184,10 +184,14 @@ end end end + end - context 'with replacement' do - pending - end + context 'with replacement' do + let(:replace) { true } + + specify { expect(opts).to include(replace: true) } + + pending end end From 727e6077eb2283c78b504ca833e78349a445e34c Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 16 Feb 2018 11:35:06 +0900 Subject: [PATCH 5/6] Write the detail doc of Enumerable#sample [ci skip] --- .../statistics/extension/statistics.c | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/ext/enumerable/statistics/extension/statistics.c b/ext/enumerable/statistics/extension/statistics.c index 849cef3..5bc1afd 100644 --- a/ext/enumerable/statistics/extension/statistics.c +++ b/ext/enumerable/statistics/extension/statistics.c @@ -1516,7 +1516,32 @@ enum_sample_multiple_unweighted(VALUE obj, long size, VALUE random, int replace_ } /* call-seq: - * enum.sample(n=1, random: Random, replace: false) + * enum.sample -> obj + * enum.sample(random: rng) -> obj + * enum.sample(n) -> ary + * enum.sample(n, random: rng) -> ary + * enum.sample(n, random: rng, replace: true) -> ary + * + * Choose a random element or +n+ random elements from the enumerable. + * + * The enumerable is completely scanned just once for choosing random elements + * even if +n+ is ommitted or +n+ is +1+. + * + * +replace:+ keyword specifies whether the sample is with or without + * replacement. + * + * On without-replacement sampling, the elements are chosen by using random + * in order to ensure that an element doesn't repeat itself unless the + * enumerable already contained duplicated elements. + * + * On with-replacement sampling, the elements are chosen by using random, and + * indices into the array can be duplicated even if the enumerable didn't contain + * duplicated elements. + * + * If the enumerable is empty the first two forms return +nil+, and the latter + * forms with +n+ return an empty array. + * + * The optional +rng+ argument will be used as the random number generator. */ static VALUE enum_sample(int argc, VALUE *argv, VALUE obj) From d6a1dc8e0d19928f3a403b64ce8fd8831eed7414 Mon Sep 17 00:00:00 2001 From: Kenta Murata Date: Fri, 16 Feb 2018 11:51:09 +0900 Subject: [PATCH 6/6] Update doc [ci skip] --- ext/enumerable/statistics/extension/statistics.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ext/enumerable/statistics/extension/statistics.c b/ext/enumerable/statistics/extension/statistics.c index 5bc1afd..4e4c640 100644 --- a/ext/enumerable/statistics/extension/statistics.c +++ b/ext/enumerable/statistics/extension/statistics.c @@ -1525,7 +1525,8 @@ enum_sample_multiple_unweighted(VALUE obj, long size, VALUE random, int replace_ * Choose a random element or +n+ random elements from the enumerable. * * The enumerable is completely scanned just once for choosing random elements - * even if +n+ is ommitted or +n+ is +1+. + * even if +n+ is ommitted or +n+ is +1+. This means this method cannot be + * applicable to an infinite enumerable. * * +replace:+ keyword specifies whether the sample is with or without * replacement.