|
| MinimizerMapper (const gbwtgraph::GBWTGraph &graph, const std::vector< gbwtgraph::DefaultMinimizerIndex * > &minimizer_indexes, MinimumDistanceIndex &distance_index, const PathPositionHandleGraph *path_graph=nullptr) |
|
void | map (Alignment &aln, AlignmentEmitter &alignment_emitter) |
|
vector< Alignment > | map (Alignment &aln) |
|
pair< vector< Alignment >, vector< Alignment > > | map_paired (Alignment &aln1, Alignment &aln2, vector< pair< Alignment, Alignment >> &ambiguous_pair_buffer) |
|
pair< vector< Alignment >, vector< Alignment > > | map_paired (Alignment &aln1, Alignment &aln2) |
|
bool | fragment_distr_is_finalized () |
|
void | finalize_fragment_length_distr () |
|
void | force_fragment_length_distr (double mean, double stdev) |
|
double | get_fragment_length_mean () const |
|
double | get_fragment_length_stdev () const |
|
size_t | get_fragment_length_sample_size () const |
|
size_t | get_distance_limit (size_t read_length) const |
|
void | set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
| Set all the aligner scoring parameters and create the stored aligner instances. More...
|
|
void | set_alignment_scores (std::istream &matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
|
void | set_alignment_scores (const int8_t *score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
|
|
std::vector< Minimizer > | find_minimizers (const std::string &sequence, Funnel &funnel) const |
|
std::vector< Seed > | find_seeds (const std::vector< Minimizer > &minimizers, const Alignment &aln, Funnel &funnel) const |
|
void | score_cluster (Cluster &cluster, size_t i, const std::vector< Minimizer > &minimizers, const std::vector< Seed > &seeds, size_t seq_length, Funnel &funnel) const |
|
std::vector< int > | score_extensions (const std::vector< std::vector< GaplessExtension >> &extensions, const Alignment &aln, Funnel &funnel) const |
|
std::vector< int > | score_extensions (const std::vector< std::pair< std::vector< GaplessExtension >, size_t >> &extensions, const Alignment &aln, Funnel &funnel) const |
|
void | attempt_rescue (const Alignment &aligned_read, Alignment &rescued_alignment, const std::vector< Minimizer > &minimizers, bool rescue_forward) |
|
GaplessExtender::cluster_type | seeds_in_subgraph (const std::vector< Minimizer > &minimizers, const std::unordered_set< id_t > &subgraph) const |
|
void | fix_dozeu_score (Alignment &rescued_alignment, const HandleGraph &rescue_graph, const std::vector< handle_t > &topological_order) const |
|
int64_t | distance_between (const Alignment &aln1, const Alignment &aln2) |
|
void | extension_to_alignment (const GaplessExtension &extension, Alignment &alignment) const |
|
double | compute_mapq_caps (const Alignment &aln, const std::vector< Minimizer > &minimizers, const SmallBitset &explored) |
|
void | find_optimal_tail_alignments (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, Alignment &best, Alignment &second_best) const |
|
unordered_map< size_t, unordered_map< size_t, vector< Path > > > | find_connecting_paths (const vector< GaplessExtension > &extended_seeds, size_t read_length) const |
|
vector< TreeSubgraph > | get_tail_forest (const GaplessExtension &extended_seed, size_t read_length, bool left_tails, size_t *longest_detectable_gap=nullptr) const |
|
pair< Path, size_t > | get_best_alignment_against_any_tree (const vector< TreeSubgraph > &trees, const string &sequence, const Position &default_position, bool pin_left, size_t longest_detectable_gap) const |
|
void | dfs_gbwt (const Position &from, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const |
|
void | dfs_gbwt (handle_t from_handle, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const |
|
void | dfs_gbwt (const gbwt::SearchState &start_state, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const |
|
template<typename Item , typename Score = double> |
void | process_until_threshold_a (const vector< Item > &items, const function< Score(size_t)> &get_score, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const |
|
template<typename Item , typename Score = double> |
void | process_until_threshold_b (const vector< Item > &items, const vector< Score > &scores, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const |
|
template<typename Item , typename Score = double> |
void | process_until_threshold_c (const vector< Item > &items, const function< Score(size_t)> &get_score, const function< bool(size_t, size_t)> &comparator, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const |
|
| AlignerClient (double gc_content_estimate=vg::default_gc_content) |
|
const GSSWAligner * | get_aligner (bool have_qualities=true) const |
|
const QualAdjAligner * | get_qual_adj_aligner () const |
|
const Aligner * | get_regular_aligner () const |
|
|
static double | window_breaking_quality (const vector< Minimizer > &minimizers, vector< size_t > &broken, const string &sequence, const string &quality_bytes) |
|
static double | faster_cap (const vector< Minimizer > &minimizers, vector< size_t > &minimizers_explored, const string &sequence, const string &quality_bytes) |
|
static void | for_each_aglomeration_interval (const vector< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t > &minimizer_indices, const function< void(size_t, size_t, size_t, size_t)> &iteratee) |
|
static double | get_log10_prob_of_disruption_in_interval (const vector< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t >::iterator &disrupt_begin, const vector< size_t >::iterator &disrupt_end, size_t left, size_t right) |
|
static double | get_prob_of_disruption_in_column (const vector< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t >::iterator &disrupt_begin, const vector< size_t >::iterator &disrupt_end, size_t index) |
|
static int | score_extension_group (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, int gap_open_penalty, int gap_extend_penalty) |
|
static size_t | immutable_path_from_length (const ImmutablePath &path) |
|
static Path | to_path (const ImmutablePath &path) |
|
static void | dump_debug_minimizers (const vector< Minimizer > &minimizers, const string &sequence, const vector< size_t > *to_include=nullptr) |
| Dump all the given minimizers, with optional subset restriction. More...
|
|
static void | dump_debug_extension_set (const HandleGraph &graph, const Alignment &aln, const vector< GaplessExtension > &extended_seeds) |
| Dump all the extansions in an extension set. More...
|
|
static void | dump_debug_sequence (ostream &out, const string &sequence) |
| Print a sequence with base numbering. More...
|
|
double vg::MinimizerMapper::faster_cap |
( |
const vector< Minimizer > & |
minimizers, |
|
|
vector< size_t > & |
minimizers_explored, |
|
|
const string & |
sequence, |
|
|
const string & |
quality_bytes |
|
) |
| |
|
staticprotected |
Compute a bound on the Phred score probability of a mapping beign wrong due to base errors and unlocated minimizer hits prevented us from finding the true alignment.
Algorithm uses a "sweep line" dynamic programming approach. For a read with minimizers aligned to it:
000000000011111111112222222222
012345678901234567890123456789
Read: ****************************** Minimizer 1: ***** Minimizer 2: ***** Minimizer 3: ***** Minimizer 4: *****
For each distinct read interval of overlapping minimizers, e.g. in the example the intervals 3,4,5; 6,7; 8,9,10; 18,19,20; 21,22; and 23,24,25 we consider base errors that would result in the minimizers in the interval being incorrect
We use dynamic programming sweeping left-to-right over the intervals to compute the probability of the minimum number of base errors needed to disrupt all the minimizers.
Will sort minimizers_explored (which is indices into minimizers) by minimizer start position.
unordered_map<size_t, unordered_map<size_t, vector<Path> > > vg::MinimizerMapper::find_connecting_paths |
( |
const vector< GaplessExtension > & |
extended_seeds, |
|
|
size_t |
read_length |
|
) |
| const |
|
protected |
Find for each pair of extended seeds all the haplotype-consistent graph paths against which the intervening read sequence needs to be aligned.
Limits walks from each extended seed end to the longest detectable gap plus the remaining to-be-alinged sequence, both computed using the read length.
extended_seeds must be sorted by read start position. Any extended seeds that overlap in the read will be precluded from connecting.
numeric_limits<size_t>::max() is used to store sufficiently long Paths ending before sources (which cannot be reached from other extended seeds) and starting after sinks (which cannot reach any other extended seeds). Only sources and sinks have these "tail" paths.
Tail paths are only calculated if the MinimizerMapper has linear_tails set to true.
void vg::MinimizerMapper::for_each_aglomeration_interval |
( |
const vector< Minimizer > & |
minimizers, |
|
|
const string & |
sequence, |
|
|
const string & |
quality_bytes, |
|
|
const vector< size_t > & |
minimizer_indices, |
|
|
const function< void(size_t, size_t, size_t, size_t)> & |
iteratee |
|
) |
| |
|
staticprotected |
Given a collection of minimizers, and a list of the minimizers we actually care about (as indices into the collection), iterate over common intervals of overlapping minimizer agglomerations.
Calls the given callback with (left, right, bottom, top), where left is the first base of the agglomeration interval (inclusive), right is the last base of the agglomeration interval (exclusive), bottom is the index of the first minimizer with an agglomeration in the interval and top is the index of the last minimizer with an agglomeration in the interval (exclusive).
Note that bottom and top are offsets into minimizer_indices, NOT minimizers itself. Only contiguous ranges in minimizer_indices actually make sense.
pair< Path, size_t > vg::MinimizerMapper::get_best_alignment_against_any_tree |
( |
const vector< TreeSubgraph > & |
trees, |
|
|
const string & |
sequence, |
|
|
const Position & |
default_position, |
|
|
bool |
pin_left, |
|
|
size_t |
longest_detectable_gap |
|
) |
| const |
|
protected |
Find the best alignment of the given sequence against any of the trees provided in trees, where each tree is a TreeSubgraph over the GBWT graph. Each tree subgraph is rooted at the left in its own local coordinate space, even if we are pinning on the right.
If no mapping is possible (for example, because there are no trees), produce a pure insert at default_position.
Alignment is always pinned.
If pin_left is true, pin the alignment on the left to the root of each tree. Otherwise pin it on the right to the root of each tree.
Limits the length of the longest gap to longest_detectable_gap.
Returns alingments in gbwt_graph space.
vector< TreeSubgraph > vg::MinimizerMapper::get_tail_forest |
( |
const GaplessExtension & |
extended_seed, |
|
|
size_t |
read_length, |
|
|
bool |
left_tails, |
|
|
size_t * |
longest_detectable_gap = nullptr |
|
) |
| const |
|
protected |
Get all the trees defining tails off the specified side of the specified gapless extension. Should only be called if a tail on that side exists, or this is a waste of time.
If the gapless extension starts or ends at a node boundary, there may be multiple trees produced, each with a distinct root.
If the gapless extension abuts the edge of the read, an empty forest will be produced.
Each tree is represented as a TreeSubgraph over our gbwt_graph.
If left_tails is true, the trees read out of the left sides of the gapless extension. Otherwise they read out of the right side.
As a side effect, saves the length of the longest detectable gap in an alignment of a tail to the forest into the provided location, if set.
template<typename Item , typename Score >
void vg::MinimizerMapper::process_until_threshold_a |
( |
const vector< Item > & |
items, |
|
|
const function< Score(size_t)> & |
get_score, |
|
|
double |
threshold, |
|
|
size_t |
min_count, |
|
|
size_t |
max_count, |
|
|
const function< bool(size_t)> & |
process_item, |
|
|
const function< void(size_t)> & |
discard_item_by_count, |
|
|
const function< void(size_t)> & |
discard_item_by_score |
|
) |
| const |
|
protected |
Given a vector of items, a function to get the score of each, a score-difference-from-the-best cutoff, and a min and max processed item count, process items in descending score order by calling process_item with the item's number, until min_count items are processed and either max_count items are processed or the score difference threshold is hit (or we run out of items).
If process_item returns false, the item is skipped and does not count against min_count or max_count.
Call discard_item_by_count with the item's number for all remaining items that would pass the score threshold.
Call discard_item_by_score with the item's number for all remaining items that would fail the score threshold.
static double vg::MinimizerMapper::window_breaking_quality |
( |
const vector< Minimizer > & |
minimizers, |
|
|
vector< size_t > & |
broken, |
|
|
const string & |
sequence, |
|
|
const string & |
quality_bytes |
|
) |
| |
|
staticprotected |
Compute a bound on the Phred score probability of having created the agglomerations of the specified minimizers by base errors from the given sequence, which was sequenced with the given qualities.
No limit is imposed if broken is empty.
Takes the collection of all minimizers found, and a vector of the indices of minimizers we are interested in the agglomerations of. May modify the order of that index vector.
Also takes the sequence of the read (to avoid Ns) and the quality string (interpreted as a byte array).
Currently computes a lower-score-bound, upper-probability-bound, suitable for use as a mapping quality cap, by assuming the easiest-to-disrupt possible layout of the windows, and the lowest possible qualities for the disrupting bases.