diff --git a/algorithms.py b/algorithms.py index 9b80f9d39..b3a8f51fd 100644 --- a/algorithms.py +++ b/algorithms.py @@ -2,6 +2,7 @@ Python version of the simulation algorithm. """ import argparse +import dataclasses import heapq import logging import math @@ -385,55 +386,34 @@ def num_lineages(self): return sum(len(s) for s in self.segments) -class TrajectorySimulator: - """ - Class to simulate an allele frequency trajectory on which to condition - the coalescent simulation. - """ +class Trajectory: + def next_frequency(self, x, dt, current_size, rand): + """ + Compute the next allele frequency in the trajectory given the + current value x and population size after time dt using the + specified random value betweeen 0 and 1. + """ + raise NotImplementedError() # pragma: no cover - def __init__(self, initial_freq, end_freq, alpha, time_slice): - self._initial_freq = initial_freq - self._end_freq = end_freq - self._alpha = alpha - self._time_slice = time_slice - self._reset() - def _reset(self): - self._allele_freqs = [] - self._times = [] +@dataclasses.dataclass() +class GenicSelectionTrajectory: + alpha: float - def _genic_selection_stochastic_forwards(self, dt, freq, alpha): - ux = (alpha * freq * (1 - freq)) / np.tanh(alpha * freq) - sign = 1 if random.random() < 0.5 else -1 - freq += (ux * dt) + sign * np.sqrt(freq * (1.0 - freq) * dt) - return freq + def next_frequency(self, x, dt, current_size, rand): + alpha = current_size * self.alpha + ux = (alpha * x * (1 - x)) / np.tanh(alpha * x) + sign = 1 if rand < 0.5 else -1 + return x + (ux * dt) + sign * np.sqrt(x * (1.0 - x) * dt) - def _simulate(self): - """ - Proposes a sweep trajectory and returns the acceptance probability. - """ - x = self._end_freq # backward time - current_size = 1 - t_inc = self._time_slice - t = 0 - while x > self._initial_freq: - self._allele_freqs.append(max(x, self._initial_freq)) - self._times.append(t) - # just a note below - # current_size = self._size_calculator(t) - # - x = 1.0 - self._genic_selection_stochastic_forwards( - t_inc, 1.0 - x, self._alpha * current_size - ) - t += self._time_slice - # will want to return current_size / N_max - # for prototype this always equals 1 - return 1 - def run(self): - while random.random() > self._simulate(): - self.reset() - return self._allele_freqs, self._times +@dataclasses.dataclass() +class Sweep: + position: float + initial_frequency: float + end_frequency: float + dt: float + trajectory: Trajectory class RateMap: @@ -587,9 +567,8 @@ def __init__( model="hudson", max_segments=100, num_labels=1, - sweep_trajectory=None, + sweep=None, full_arg=False, - time_slice=None, gene_conversion_rate=0.0, gene_conversion_length=1, discrete_genome=True, @@ -615,6 +594,7 @@ def __init__( self.num_labels = num_labels self.num_populations = N self.max_segments = max_segments + self.sweep = sweep self.full_arg = full_arg self.pedigree = None self.segment_stack = [] @@ -648,11 +628,6 @@ def __init__( self.num_re_events = 0 self.num_gc_events = 0 - # Sweep variables - self.sweep_site = (self.L // 2) - 1 # need to add options here - self.sweep_trajectory = sweep_trajectory - self.time_slice = time_slice - self.modifier_events = [(sys.float_info.max, None, None)] for time, pop_id, new_size in population_size_changes: self.modifier_events.append( @@ -989,13 +964,41 @@ def hudson_simulate(self, end_time): assert non_empty_pops == X return self.finalise() - def single_sweep_simulate(self): + def propose_trajectory(self): + """ + Proposes a sweep trajectory and returns the acceptance probability. """ - Does a structed coalescent until end_freq is reached, using - information in self.weep_trajectory. + allele_freqs = [] + times = [] + sweep = self.sweep + x = sweep.end_frequency + current_size = 1 + t = 0 + while x > sweep.initial_frequency: + allele_freqs.append(max(x, sweep.initial_frequency)) + times.append(t) + # just a note below + # current_size = self._size_calculator(t) + # + x = 1 - sweep.trajectory.next_frequency( + 1 - x, sweep.dt, current_size, random.random() + ) + t += sweep.dt + # will want to return current_size / N_max + # for prototype this always equals 1 + return allele_freqs, times, 1 + + def simulate_sweep_trajectory(self): + allele_freqs, times, acceptance = self.propose_trajectory() + while random.random() > acceptance: + allele_freqs, times, acceptance = self.propose_trajectory() + return allele_freqs, times + def single_sweep_simulate(self): + """ + Does a structed coalescent until end_freq is reached. """ - allele_freqs, times = self.sweep_trajectory + allele_freqs, times = self.simulate_sweep_trajectory() sweep_traj_step = 0 x = allele_freqs[sweep_traj_step] @@ -1018,7 +1021,7 @@ def single_sweep_simulate(self): self.P[0].add(tmp, 1) # main loop time - t_inc_orig = self.time_slice + t_inc_orig = self.sweep.dt e_time = 0.0 while self.ancestors_remain() and sweep_traj_step < len(times) - 1: self.verify() @@ -1083,12 +1086,12 @@ def single_sweep_simulate(self): if r < e_sum / sweep_pop_tot_rate: # recomb in B self.hudson_recombination_event_sweep_phase( - 1, self.sweep_site, x + 1, self.sweep.position, x ) else: # recomb in b self.hudson_recombination_event_sweep_phase( - 0, self.sweep_site, 1.0 - x + 0, self.sweep.position, 1.0 - x ) # clean up the labels at end for idx, u in enumerate(self.P[0].iter_label(1)): @@ -2163,16 +2166,22 @@ def run_simulate(args): rates = args.recomb_rates recombination_map = RateMap(positions, rates) num_labels = 1 - sweep_trajectory = None + sweep = None if args.model == "single_sweep": if num_populations > 1: raise ValueError("Multiple populations not currently supported") # Compute the trajectory if args.trajectory is None: raise ValueError("Must provide trajectory (init_freq, end_freq, alpha)") - init_freq, end_freq, alpha = args.trajectory - traj_sim = TrajectorySimulator(init_freq, end_freq, alpha, args.time_slice) - sweep_trajectory = traj_sim.run() + initial_frequency, end_frequency, alpha = args.trajectory + trajectory = GenicSelectionTrajectory(alpha) + sweep = Sweep( + position=m // 2 + 1, + initial_frequency=initial_frequency, + end_frequency=end_frequency, + dt=args.time_slice, + trajectory=trajectory, + ) num_labels = 2 random.seed(args.random_seed) np.random.seed(args.random_seed + 1) @@ -2204,8 +2213,7 @@ def run_simulate(args): max_segments=100000, num_labels=num_labels, full_arg=args.full_arg, - sweep_trajectory=sweep_trajectory, - time_slice=args.time_slice, + sweep=sweep, gene_conversion_rate=gc_rate, gene_conversion_length=mean_tract_length, discrete_genome=args.discrete, diff --git a/lib/msprime.c b/lib/msprime.c index 054725bac..b0473d053 100644 --- a/lib/msprime.c +++ b/lib/msprime.c @@ -895,6 +895,8 @@ msp_free(msp_t *self) msp_safe_free(self->buffered_edges); msp_safe_free(self->root_segments); msp_safe_free(self->initial_overlaps); + msp_safe_free(self->trajectory.time); + msp_safe_free(self->trajectory.allele_frequency); /* free the object heaps */ object_heap_free(&self->avl_node_heap); object_heap_free(&self->node_mapping_heap); @@ -1532,6 +1534,27 @@ msp_print_initial_overlaps(msp_t *self, FILE *out) fprintf(out, "\t%f -> %d\n", overlap->left, overlap->count); } +static void +msp_sweep_print_state(msp_t *self, FILE *out) +{ + const sweep_t *sweep = &self->model.params.sweep; + const sweep_trajectory_t *trajectory = &self->trajectory; + tsk_size_t j; + + fprintf(out, "Sweep\n"); + fprintf(out, "\tposition=%f\n", sweep->position); + fprintf(out, "\tstart_frequency=%f\n", sweep->start_frequency); + fprintf(out, "\tend_frequency=%f\n", sweep->end_frequency); + fprintf(out, "\tdt=%f\n", sweep->dt); + fprintf(out, "\ts=%f\n", sweep->s); + fprintf(out, "\tTrajectory (size=%d, max=%d)\n", (int) trajectory->num_steps, + (int) trajectory->max_steps); + for (j = 0; j < trajectory->num_steps; j++) { + fprintf(out, "\t\t%.14f\t%.14f\n", trajectory->time[j], + trajectory->allele_frequency[j]); + } +} + int msp_print_state(msp_t *self, FILE *out) { @@ -1564,8 +1587,7 @@ msp_print_state(msp_t *self, FILE *out) self->model.params.dirac_coalescent.psi, self->model.params.dirac_coalescent.c); } else if (self->model.type == MSP_MODEL_SWEEP) { - fprintf(out, "\tsweep @ locus = %f\n", self->model.params.sweep.position); - self->model.params.sweep.print_state(&self->model.params.sweep, out); + msp_sweep_print_state(self, out); } fprintf(out, "L = %.14g\n", self->sequence_length); fprintf(out, "discrete_genome = %d\n", self->discrete_genome); @@ -4933,20 +4955,131 @@ msp_sweep_recombination_event( return ret; } +static int +msp_alloc_sweep_trajectory(msp_t *self) +{ + int ret = 0; + sweep_trajectory_t *trajectory = &self->trajectory; + + if (trajectory->max_steps == 0) { + trajectory->max_steps = 64; + trajectory->time = malloc(trajectory->max_steps * sizeof(*trajectory->time)); + trajectory->allele_frequency + = malloc(trajectory->max_steps * sizeof(*trajectory->allele_frequency)); + if (trajectory->time == NULL || trajectory->allele_frequency == NULL) { + ret = MSP_ERR_NO_MEMORY; + goto out; + } + } +out: + return ret; +} + +static int +msp_expand_sweep_trajectory(msp_t *self) +{ + int ret = 0; + void *tmp; + sweep_trajectory_t *trajectory = &self->trajectory; + + if (trajectory->num_steps + 1 == trajectory->max_steps) { + trajectory->max_steps *= 2; + tmp = realloc( + trajectory->time, trajectory->max_steps * sizeof(*trajectory->time)); + if (tmp == NULL) { + ret = MSP_ERR_NO_MEMORY; + goto out; + } + trajectory->time = tmp; + tmp = realloc(trajectory->allele_frequency, + trajectory->max_steps * sizeof(*trajectory->allele_frequency)); + if (tmp == NULL) { + ret = MSP_ERR_NO_MEMORY; + goto out; + } + trajectory->allele_frequency = tmp; + } +out: + return ret; +} + +static int +msp_sweep_generate_trajectory(msp_t *self) +{ + int ret; + /* double x, t, *tmp, pop_size, sim_time, alpha; */ + sweep_trajectory_t *trajectory = &self->trajectory; + const sweep_t *sweep = &self->model.params.sweep; + const double dt = sweep->dt; + const double start_frequency = sweep->start_frequency; + const double end_frequency = sweep->end_frequency; + double x, y, u, t, pop_size, sim_time; + + ret = msp_alloc_sweep_trajectory(self); + if (ret != 0) { + goto out; + } + + /* TODO Wrap this in a rejection sample loop */ + + t = 0; + x = end_frequency; + /* TODO not clear why we have two ways of measuring time here. */ + sim_time = self->time; /*time in generations*/ + trajectory->num_steps = 0; + /* TODO see note below about assigning to this in muliple places */ + trajectory->time[trajectory->num_steps] = t; + trajectory->allele_frequency[trajectory->num_steps] = x; + trajectory->num_steps++; + + while (x > start_frequency) { + ret = msp_expand_sweep_trajectory(self); + if (ret != 0) { + goto out; + } + pop_size = get_population_size(&self->populations[0], sim_time); + u = gsl_rng_uniform(self->rng); + y = sweep->next_frequency(1 - x, dt, pop_size, u, sweep->trajectory_params); + x = 1.0 - y; + if (x < 0 || x > 1) { + ret = MSP_ERR_BAD_TRAJECTORY; + goto out; + } + + t += dt; + sim_time += dt * pop_size * self->ploidy; + + if (x > start_frequency) { + /* TODO it would be nice to refactor here so that this + * condition was checked only once per loop. We could + * have a break in the middle, or else maybe restructure + * a bit ? We're assigning values in three places at the + * moment, which is surely not necessary. */ + trajectory->allele_frequency[trajectory->num_steps] = x; + trajectory->time[trajectory->num_steps] = t; + trajectory->num_steps++; + } + } + tsk_bug_assert(trajectory->num_steps < trajectory->max_steps); + trajectory->time[trajectory->num_steps] = t; + trajectory->allele_frequency[trajectory->num_steps] = start_frequency; + trajectory->num_steps++; +out: + return ret; +} + static int msp_run_sweep(msp_t *self) { int ret = 0; simulation_model_t *model = &self->model; - size_t curr_step = 1; - size_t num_steps; - double *allele_frequency = NULL; - double *time = NULL; + size_t curr_step; double sweep_locus = model->params.sweep.position; - double sweep_dt; + double sweep_dt = model->params.sweep.dt; + population_t *population; + sweep_trajectory_t *trajectory = &self->trajectory; size_t j = 0; double recomb_mass; - unsigned long events = 0; label_id_t label; double rec_rates[] = { 0.0, 0.0 }; double sweep_pop_sizes[] = { 0.0, 0.0 }; @@ -4978,53 +5111,50 @@ msp_run_sweep(msp_t *self) * depending on the value of curr_step, and hopefully reintroduce the max_time * and max_steps parameters. */ - ret = model->params.sweep.generate_trajectory( - &model->params.sweep, self, &num_steps, &time, &allele_frequency); - - t_start = self->time; - sweep_dt = model->params.sweep.trajectory_params.genic_selection_trajectory.dt; - tsk_bug_assert(sweep_dt > 0); + ret = msp_sweep_generate_trajectory(self); if (ret != 0) { goto out; } - ret = msp_sweep_initialise(self, allele_frequency[0]); + ret = msp_sweep_initialise(self, trajectory->allele_frequency[0]); if (ret != 0) { goto out; } + /* TODO the population ID should be a parameter of the sweep */ + population = &self->populations[0]; + tsk_bug_assert(sweep_dt > 0); + t_start = self->time; curr_step = 1; - while (msp_get_num_ancestors(self) > 0 && curr_step < num_steps) { - events++; + while (msp_get_num_ancestors(self) > 0 && curr_step < trajectory->num_steps) { /* Set pop sizes & rec_rates */ for (j = 0; j < self->num_labels; j++) { label = (label_id_t) j; recomb_mass = self->recomb_mass_index == NULL ? 0 : fenwick_get_total(&self->recomb_mass_index[label]); - sweep_pop_sizes[j] = avl_count(&self->populations[0].ancestors[label]); + sweep_pop_sizes[j] = avl_count(&population->ancestors[label]); rec_rates[j] = recomb_mass; } event_prob = 1.0; event_rand = gsl_rng_uniform(self->rng); sweep_over = false; - while (event_prob > event_rand && curr_step < num_steps && !sweep_over) { - pop_size = get_population_size(&self->populations[0], self->time); + while (event_prob > event_rand && curr_step < trajectory->num_steps + && !sweep_over) { + pop_size = get_population_size(population, self->time); p_coal_B = 0; - if (avl_count(&self->populations[0].ancestors[1]) > 1) { + if (avl_count(&population->ancestors[1]) > 1) { p_coal_B = ((sweep_pop_sizes[1] * (sweep_pop_sizes[1] - 1)) * 0.5) - / allele_frequency[curr_step] * sweep_dt; + / trajectory->allele_frequency[curr_step] * sweep_dt; } p_coal_b = 0; - if (avl_count(&self->populations[0].ancestors[0]) > 1) { + if (avl_count(&population->ancestors[0]) > 1) { p_coal_b = ((sweep_pop_sizes[0] * (sweep_pop_sizes[0] - 1)) * 0.5) - / (1.0 - allele_frequency[curr_step]) * sweep_dt; + / (1.0 - trajectory->allele_frequency[curr_step]) * sweep_dt; } p_rec_b = rec_rates[0] * pop_size * self->ploidy * sweep_dt; p_rec_B = rec_rates[1] * pop_size * self->ploidy * sweep_dt; sweep_pop_tot_rate = p_coal_b + p_coal_B + p_rec_b + p_rec_B; - /* doing this to build in generality if we want >1 pop */ - total_rate = sweep_pop_tot_rate; /* debug prints below printf("pop_size: %g sweep_dt: %g sweep_pop_sizes[1]: %g sweep_pop_sizes[1]: @@ -5048,8 +5178,8 @@ msp_run_sweep(msp_t *self) e_sum = p_coal_b; /* convert time scale */ - pop_size = get_population_size(&self->populations[0], self->time); - t_unscaled = time[curr_step - 1] * self->ploidy * pop_size; + pop_size = get_population_size(population, self->time); + t_unscaled = trajectory->time[curr_step - 1] * self->ploidy * pop_size; tsk_bug_assert(t_unscaled > 0); self->time = t_start + t_unscaled; /* printf("event time: %g\n", self->time); */ @@ -5065,12 +5195,12 @@ msp_run_sweep(msp_t *self) e_sum += p_rec_b; if (tmp_rand < e_sum / sweep_pop_tot_rate) { /* recomb in b background */ - ret = msp_sweep_recombination_event( - self, 0, sweep_locus, (1.0 - allele_frequency[curr_step - 1])); + ret = msp_sweep_recombination_event(self, 0, sweep_locus, + (1.0 - trajectory->allele_frequency[curr_step - 1])); } else { /* recomb in B background */ - ret = msp_sweep_recombination_event( - self, 1, sweep_locus, allele_frequency[curr_step - 1]); + ret = msp_sweep_recombination_event(self, 1, sweep_locus, + trajectory->allele_frequency[curr_step - 1]); } } } @@ -5106,8 +5236,6 @@ msp_run_sweep(msp_t *self) } ret = MSP_EXIT_MODEL_COMPLETE; out: - msp_safe_free(time); - msp_safe_free(allele_frequency); return ret; } @@ -7026,110 +7154,24 @@ msp_beta_common_ancestor_event(msp_t *self, population_id_t pop_id, label_id_t l } /************************************************************** - * Allele frequency trajectory simulation for genic selection - * + * Allele frequency trajectory simulation for sweep models **************************************************************/ static double -genic_selection_stochastic_forwards(double dt, double freq, double alpha, double u) +next_frequency_genic_selection_stochastic( + double freq, double dt, double pop_size, double u, void *params) { + sweep_t *sweep = (sweep_t *) params; + double s = sweep->s; /* this is scaled following Ewens chapter 5 e.g., * w_11=1+s; w_12=1+s/2; w_22=1; that is h=0.5 */ + /* FIXME where should this 2 be? Is it really the ploidy value? */ + double alpha = 2 * pop_size * s; double ux = ((alpha / 2.0) * freq * (1 - freq)) / tanh((alpha / 2.0) * freq); int sign = u < 0.5 ? 1 : -1; return freq + (ux * dt) + sign * sqrt(freq * (1.0 - freq) * dt); } -static int -genic_selection_generate_trajectory(sweep_t *self, msp_t *simulator, - size_t *ret_num_steps, double **ret_time, double **ret_allele_frequency) -{ - int ret = 0; - genic_selection_trajectory_t trajectory - = self->trajectory_params.genic_selection_trajectory; - gsl_rng *rng = simulator->rng; - size_t max_steps = 64; - double *time = malloc(max_steps * sizeof(*time)); - double *allele_frequency = malloc(max_steps * sizeof(*allele_frequency)); - double x, t, *tmp, pop_size, sim_time, alpha; - size_t num_steps; - - if (time == NULL || allele_frequency == NULL) { - ret = MSP_ERR_NO_MEMORY; - goto out; - } - /* TODO Wrap this in a rejection sample loop and get the population size - * from the simulator. We can use - * pop_size = get_population_size(sim->populations[0], time); - * to do this because we assume there are no demographic events - * during a sweep */ - - x = trajectory.end_frequency; - num_steps = 0; - t = 0; - sim_time = simulator->time; /*time in generations*/ - time[num_steps] = t; - allele_frequency[num_steps] = x; - num_steps++; - while (x > trajectory.start_frequency) { - if (num_steps + 1 >= max_steps) { - max_steps *= 2; - tmp = realloc(time, max_steps * sizeof(*time)); - if (tmp == NULL) { - ret = MSP_ERR_NO_MEMORY; - goto out; - } - time = tmp; - tmp = realloc(allele_frequency, max_steps * sizeof(*allele_frequency)); - if (tmp == NULL) { - ret = MSP_ERR_NO_MEMORY; - goto out; - } - allele_frequency = tmp; - } - pop_size = get_population_size(&simulator->populations[0], sim_time); - alpha = 2 * pop_size * trajectory.s; - x = 1.0 - - genic_selection_stochastic_forwards( - trajectory.dt, 1.0 - x, alpha, gsl_rng_uniform(rng)); - /* need our recored traj to stay in bounds */ - t += trajectory.dt; - sim_time += trajectory.dt * pop_size * simulator->ploidy; - if (x > trajectory.start_frequency) { - allele_frequency[num_steps] = x; - time[num_steps] = t; - num_steps++; - } - } - tsk_bug_assert(num_steps < max_steps); /* num_steps + 1 above guarantees this */ - time[num_steps] = t; - allele_frequency[num_steps] = trajectory.start_frequency; - num_steps++; - - *ret_num_steps = num_steps; - *ret_time = time; - *ret_allele_frequency = allele_frequency; - time = NULL; - allele_frequency = NULL; -out: - msp_safe_free(time); - msp_safe_free(allele_frequency); - return ret; -} - -static void -genic_selection_print_state(sweep_t *self, FILE *out) -{ - genic_selection_trajectory_t *trajectory - = &self->trajectory_params.genic_selection_trajectory; - - fprintf(out, "\tGenic selection trajectory\n"); - fprintf(out, "\t\tstart_frequency = %f\n", trajectory->start_frequency); - fprintf(out, "\t\tend_frequency = %f\n", trajectory->end_frequency); - fprintf(out, "\t\ts = %f\n", trajectory->s); - fprintf(out, "\t\tdt = %f\n", trajectory->dt); -} - /************************************************************** * Public API for setting simulation models. **************************************************************/ @@ -7322,13 +7364,12 @@ msp_set_simulation_model_beta(msp_t *self, double alpha, double truncation_point } int -msp_set_simulation_model_sweep_genic_selection(msp_t *self, double position, - double start_frequency, double end_frequency, double s, double dt) +msp_set_simulation_model_sweep(msp_t *self, double position, double start_frequency, + double end_frequency, double dt, next_frequency_func_t next_frequency, + void *trajectory_params) { int ret = 0; - simulation_model_t *model = &self->model; - genic_selection_trajectory_t *trajectory - = &model->params.sweep.trajectory_params.genic_selection_trajectory; + sweep_t *sweep = &self->model.params.sweep; double L = self->sequence_length; /* Check the inputs to make sure they make sense */ @@ -7349,22 +7390,34 @@ msp_set_simulation_model_sweep_genic_selection(msp_t *self, double position, ret = MSP_ERR_BAD_TIME_DELTA; goto out; } - if (s <= 0) { - ret = MSP_ERR_BAD_SWEEP_GENIC_SELECTION_S; + ret = msp_set_simulation_model(self, MSP_MODEL_SWEEP); + if (ret != 0) { goto out; } + sweep->position = position; + sweep->start_frequency = start_frequency; + sweep->end_frequency = end_frequency; + sweep->dt = dt; + sweep->next_frequency = next_frequency; + sweep->trajectory_params = trajectory_params; +out: + return ret; +} - ret = msp_set_simulation_model(self, MSP_MODEL_SWEEP); - if (ret != 0) { +int +msp_set_simulation_model_sweep_genic_selection(msp_t *self, double position, + double start_frequency, double end_frequency, double s, double dt) +{ + int ret = 0; + sweep_t *sweep = &self->model.params.sweep; + + if (s <= 0) { + ret = MSP_ERR_BAD_SWEEP_GENIC_SELECTION_S; goto out; } - model->params.sweep.position = position; - model->params.sweep.generate_trajectory = genic_selection_generate_trajectory; - model->params.sweep.print_state = genic_selection_print_state; - trajectory->start_frequency = start_frequency; - trajectory->end_frequency = end_frequency; - trajectory->s = s; - trajectory->dt = dt; + sweep->s = s; + ret = msp_set_simulation_model_sweep(self, position, start_frequency, end_frequency, + dt, next_frequency_genic_selection_stochastic, sweep); out: return ret; } diff --git a/lib/msprime.h b/lib/msprime.h index 2807d92a4..770979c1b 100644 --- a/lib/msprime.h +++ b/lib/msprime.h @@ -158,23 +158,36 @@ typedef struct { /* Forward declaration */ struct _msp_t; +/* A generated trajectory realisation */ typedef struct { - /* TODO document these parameters.*/ - double start_frequency; - double end_frequency; - double s; - double dt; -} genic_selection_trajectory_t; + tsk_size_t num_steps; + tsk_size_t max_steps; + double *time; + double *allele_frequency; +} sweep_trajectory_t; + +typedef double (*next_frequency_func_t)( + double x, double dt, double pop_size, double rand, void *params); typedef struct _sweep_t { double position; - union { - /* Future trajectory simulation models would go here */ - genic_selection_trajectory_t genic_selection_trajectory; - } trajectory_params; - int (*generate_trajectory)(struct _sweep_t *self, struct _msp_t *simulator, - size_t *num_steps, double **time, double **allele_frequency); - void (*print_state)(struct _sweep_t *self, FILE *out); + double start_frequency; + double end_frequency; + /* JK: Question for ADK: why isn't dt in units of generations? This would seem + * less confusing overall, since we're converting back into generations during + * the simulation anyway. We could also default it to 1 then, rather than + * requiring users to scale it by N (which is different to every other parameter + * we have */ + double dt; + void *trajectory_params; + next_frequency_func_t next_frequency; + /* double (*next_frequency)( */ + /* double x, double dt, double pop_size, double rand, void *params); */ + /* For simplicity store the paramters here for now. If things get complicated + * and some models have lots of different parameters we can put in a + * union which we manipulate to make sure there's always enough space + * to store the parameters. */ + double s; } sweep_t; typedef struct _simulation_model_t { @@ -262,6 +275,7 @@ typedef struct _msp_t { tsk_edge_t *buffered_edges; tsk_size_t num_buffered_edges; tsk_size_t max_buffered_edges; + sweep_trajectory_t trajectory; /* Methods for getting the waiting time until the next common ancestor * event and the event are defined by the simulation model */ double (*get_common_ancestor_waiting_time)( @@ -427,6 +441,9 @@ int msp_set_simulation_model_dtwf(msp_t *self); int msp_set_simulation_model_wf_ped(msp_t *self); int msp_set_simulation_model_dirac(msp_t *self, double psi, double c); int msp_set_simulation_model_beta(msp_t *self, double alpha, double truncation_point); +int msp_set_simulation_model_sweep(msp_t *self, double position, double start_frequency, + double end_frequency, double dt, next_frequency_func_t next_frequency, + void *trajectory_params); int msp_set_simulation_model_sweep_genic_selection(msp_t *self, double position, double start_frequency, double end_frequency, double s, double dt); diff --git a/lib/tests/test_sweeps.c b/lib/tests/test_sweeps.c index aa3f149d2..243501f2e 100644 --- a/lib/tests/test_sweeps.c +++ b/lib/tests/test_sweeps.c @@ -28,36 +28,36 @@ verify_simple_genic_selection_trajectory( gsl_rng *rng = safe_rng_alloc(); sample_t samples[] = { { 0, 0.0 }, { 0, 0.0 } }; tsk_table_collection_t tables; - size_t j, num_steps; - double *allele_frequency, *time; + size_t j; ret = build_sim(&msp, &tables, rng, 1.0, 1, samples, 2); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = msp_set_simulation_model_sweep_genic_selection( &msp, 0.5, start_frequency, end_frequency, alpha, dt); CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = msp_set_num_labels(&msp, 2); + CU_ASSERT_EQUAL_FATAL(ret, 0); ret = msp_initialise(&msp); CU_ASSERT_EQUAL_FATAL(ret, 0); /* compute the trajectory */ - ret = msp.model.params.sweep.generate_trajectory( - &msp.model.params.sweep, &msp, &num_steps, &time, &allele_frequency); - CU_ASSERT_EQUAL_FATAL(ret, 0); - CU_ASSERT_FATAL(num_steps > 1); - CU_ASSERT_EQUAL(time[0], 0); - CU_ASSERT_EQUAL(allele_frequency[0], end_frequency); - CU_ASSERT_TRUE(allele_frequency[num_steps - 1] == start_frequency); - - for (j = 0; j < num_steps; j++) { - CU_ASSERT_TRUE(allele_frequency[j] >= 0); - CU_ASSERT_TRUE(allele_frequency[j] <= 1); + ret = msp_run(&msp, DBL_MAX, UINT32_MAX); + CU_ASSERT_FATAL(ret >= 0); + CU_ASSERT_FATAL(msp.trajectory.num_steps > 1); + CU_ASSERT_EQUAL(msp.trajectory.time[0], 0); + CU_ASSERT_EQUAL(msp.trajectory.allele_frequency[0], end_frequency); + CU_ASSERT_TRUE(msp.trajectory.allele_frequency[msp.trajectory.num_steps - 1] + == start_frequency); + + for (j = 0; j < msp.trajectory.num_steps; j++) { + CU_ASSERT_TRUE(msp.trajectory.allele_frequency[j] >= 0); + CU_ASSERT_TRUE(msp.trajectory.allele_frequency[j] <= 1); if (j > 0) { - CU_ASSERT_DOUBLE_EQUAL_FATAL(time[j], time[j - 1] + dt, 1e-9); + CU_ASSERT_DOUBLE_EQUAL_FATAL( + msp.trajectory.time[j], msp.trajectory.time[j - 1] + dt, 1e-9); } } - free(time); - free(allele_frequency); msp_free(&msp); tsk_table_collection_free(&tables); gsl_rng_free(rng); @@ -73,6 +73,96 @@ test_genic_selection_trajectory(void) verify_simple_genic_selection_trajectory(0.1, 0.4, 50, 0.0001); } +static double +next_frequency_linear(double freq, double dt, double pop_size, double u, void *params) +{ + return freq + dt; +} + +static void +test_linear_trajectory(void) +{ + int ret; + msp_t msp; + gsl_rng *rng = safe_rng_alloc(); + sample_t samples[] = { { 0, 0.0 }, { 0, 0.0 } }; + tsk_table_collection_t tables; + size_t j; + double x, t; + double dt = 0.125; + double start_frequency = 0.125; + double end_frequency = 0.875; + + ret = build_sim(&msp, &tables, rng, 1.0, 1, samples, 2); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = msp_set_simulation_model_sweep( + &msp, 0.5, start_frequency, end_frequency, dt, next_frequency_linear, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = msp_set_num_labels(&msp, 2); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = msp_initialise(&msp); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + /* compute the trajectory */ + ret = msp_run(&msp, DBL_MAX, UINT32_MAX); + CU_ASSERT_FATAL(ret >= 0); + CU_ASSERT_FATAL(msp.trajectory.num_steps > 1); + CU_ASSERT_EQUAL(msp.trajectory.time[0], 0); + CU_ASSERT_EQUAL(msp.trajectory.allele_frequency[0], end_frequency); + CU_ASSERT_TRUE(msp.trajectory.allele_frequency[msp.trajectory.num_steps - 1] + == start_frequency); + + x = end_frequency; + t = 0; + for (j = 0; j < msp.trajectory.num_steps; j++) { + CU_ASSERT_TRUE(msp.trajectory.allele_frequency[j] == x); + CU_ASSERT_TRUE(msp.trajectory.time[j] == t); + t += dt; + x -= dt; + } + + msp_free(&msp); + tsk_table_collection_free(&tables); + gsl_rng_free(rng); +} + +static double +next_frequency_negative(double freq, double dt, double pop_size, double u, void *params) +{ + return -1 * freq; +} + +static void +test_bad_trajectory(void) +{ + int ret; + msp_t msp; + gsl_rng *rng = safe_rng_alloc(); + sample_t samples[] = { { 0, 0.0 }, { 0, 0.0 } }; + tsk_table_collection_t tables; + double dt = 0.125; + double start_frequency = 0.125; + double end_frequency = 0.875; + + ret = build_sim(&msp, &tables, rng, 1.0, 1, samples, 2); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = msp_set_simulation_model_sweep( + &msp, 0.5, start_frequency, end_frequency, dt, next_frequency_negative, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = msp_set_num_labels(&msp, 2); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = msp_initialise(&msp); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + /* compute the trajectory */ + ret = msp_run(&msp, DBL_MAX, UINT32_MAX); + CU_ASSERT_EQUAL_FATAL(ret, MSP_ERR_BAD_TRAJECTORY); + + msp_free(&msp); + tsk_table_collection_free(&tables); + gsl_rng_free(rng); +} + static void test_sweep_genic_selection_bad_parameters(void) { @@ -157,6 +247,7 @@ test_sweep_genic_selection_events(void) CU_ASSERT_EQUAL(ret, 0); ret = msp_run(&msp, DBL_MAX, UINT32_MAX); CU_ASSERT_EQUAL(ret, MSP_ERR_EVENTS_DURING_SWEEP); + msp_print_state(&msp, _devnull); msp_free(&msp); tsk_table_collection_free(&tables); @@ -201,7 +292,6 @@ verify_sweep_genic_selection(double sequence_length, double s) CU_ASSERT_EQUAL(ret, 0); ret = msp_run(&msp, DBL_MAX, UINT32_MAX); CU_ASSERT_TRUE(ret >= 0); - msp_print_state(&msp, _devnull); ret = msp_finalise_tables(&msp); CU_ASSERT_EQUAL(ret, 0); msp_free(&msp); @@ -383,6 +473,8 @@ main(int argc, char **argv) { CU_TestInfo tests[] = { { "test_genic_selection_trajectory", test_genic_selection_trajectory }, + { "test_linear_trajectory", test_linear_trajectory }, + { "test_bad_trajectory", test_bad_trajectory }, { "test_sweep_genic_selection_bad_parameters", test_sweep_genic_selection_bad_parameters }, { "test_sweep_genic_selection_events", test_sweep_genic_selection_events }, diff --git a/lib/util.c b/lib/util.c index c537b8ac5..467458750 100644 --- a/lib/util.c +++ b/lib/util.c @@ -135,6 +135,9 @@ msp_strerror_internal(int err) case MSP_ERR_BAD_TRAJECTORY_START_END: ret = "Start frequency must be less than end frequency"; break; + case MSP_ERR_BAD_TRAJECTORY: + ret = "Computed trajectory outside valid bounds 0 <= x <= 1"; + break; case MSP_ERR_BAD_SWEEP_GENIC_SELECTION_S: ret = "alpha must be > 0"; break; diff --git a/lib/util.h b/lib/util.h index dd70c4066..91f3af48a 100644 --- a/lib/util.h +++ b/lib/util.h @@ -124,6 +124,7 @@ #define MSP_ERR_POPULATION_CURRENTLY_ACTIVE -80 #define MSP_ERR_DEACTIVATE_INACTIVE_POPULATION -81 #define MSP_ERR_DEACTIVATE_PREVIOUSLY_ACTIVE_POPULATION -82 +#define MSP_ERR_BAD_TRAJECTORY -83 /* clang-format on */ /* This bit is 0 for any errors originating from tskit */