vg
tools for working with variation graphs
Classes | Public Types | Public Member Functions | Public Attributes | Protected Types | Protected Member Functions | Static Protected Member Functions | Protected Attributes | List of all members
vg::MinimizerMapper Class Reference

#include <minimizer_mapper.hpp>

Inheritance diagram for vg::MinimizerMapper:
vg::AlignerClient

Classes

struct  Minimizer
 

Public Types

enum  RescueAlgorithm { rescue_none, rescue_dozeu, rescue_gssw, rescue_haplotypes }
 Implemented rescue algorithms: no rescue, dozeu, GSSW, dozeu on local haplotypes. More...
 

Public Member Functions

 MinimizerMapper (const gbwtgraph::GBWTGraph &graph, const std::vector< gbwtgraph::DefaultMinimizerIndex * > &minimizer_indexes, MinimumDistanceIndex &distance_index, const PathPositionHandleGraph *path_graph=nullptr)
 
void map (Alignment &aln, AlignmentEmitter &alignment_emitter)
 
vector< Alignment > map (Alignment &aln)
 
pair< vector< Alignment >, vector< Alignment > > map_paired (Alignment &aln1, Alignment &aln2, vector< pair< Alignment, Alignment >> &ambiguous_pair_buffer)
 
pair< vector< Alignment >, vector< Alignment > > map_paired (Alignment &aln1, Alignment &aln2)
 
bool fragment_distr_is_finalized ()
 
void finalize_fragment_length_distr ()
 
void force_fragment_length_distr (double mean, double stdev)
 
double get_fragment_length_mean () const
 
double get_fragment_length_stdev () const
 
size_t get_fragment_length_sample_size () const
 
size_t get_distance_limit (size_t read_length) const
 
- Public Member Functions inherited from vg::AlignerClient
void set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 Set all the aligner scoring parameters and create the stored aligner instances. More...
 
void set_alignment_scores (std::istream &matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 
void set_alignment_scores (const int8_t *score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus)
 

Public Attributes

size_t hit_cap = 10
 Use all minimizers with at most hit_cap hits. More...
 
size_t hard_hit_cap = 500
 Ignore all minimizers with more than hard_hit_cap hits. More...
 
double minimizer_score_fraction = 0.9
 
size_t min_extensions = 2
 Accept at least this many clusters. More...
 
size_t max_extensions = 800
 How many clusters should we align? More...
 
size_t max_alignments = 8
 How many extended clusters should we align, max? More...
 
size_t max_local_extensions = numeric_limits<size_t>::max()
 How many extensions should we try as seeds within a mapping location? More...
 
double cluster_score_threshold = 50
 
double pad_cluster_score_threshold = 20
 
double cluster_coverage_threshold = 0.3
 
double extension_set_score_threshold = 20
 
int extension_score_threshold = 1
 
size_t max_multimaps = 1
 
size_t distance_limit = 200
 
bool do_dp = true
 
string sample_name
 
string read_group
 
bool track_provenance = false
 
bool track_correctness = false
 
double paired_distance_stdevs = 2.0
 
double paired_rescue_score_limit = 0.9
 How close does an alignment have to be to the best alignment for us to rescue on it. More...
 
double rescue_subgraph_stdevs = 4.0
 How many stdevs from the mean do we extract a subgraph from? More...
 
size_t max_rescue_attempts = 15
 For paired end mapping, how many times should we attempt rescue (per read)? More...
 
RescueAlgorithm rescue_algorithm = rescue_dozeu
 The algorithm used for rescue. More...
 
- Public Attributes inherited from vg::AlignerClient
bool adjust_alignments_for_base_quality = false
 

Protected Types

typedef SnarlSeedClusterer::Seed Seed
 The information we store for each seed. More...
 
typedef SnarlSeedClusterer::Cluster Cluster
 The information we store for each cluster. More...
 
using ImmutablePath = structures::ImmutableList< Mapping >
 

Protected Member Functions

std::vector< Minimizer > find_minimizers (const std::string &sequence, Funnel &funnel) const
 
std::vector< Seed > find_seeds (const std::vector< Minimizer > &minimizers, const Alignment &aln, Funnel &funnel) const
 
void score_cluster (Cluster &cluster, size_t i, const std::vector< Minimizer > &minimizers, const std::vector< Seed > &seeds, size_t seq_length, Funnel &funnel) const
 
std::vector< int > score_extensions (const std::vector< std::vector< GaplessExtension >> &extensions, const Alignment &aln, Funnel &funnel) const
 
std::vector< int > score_extensions (const std::vector< std::pair< std::vector< GaplessExtension >, size_t >> &extensions, const Alignment &aln, Funnel &funnel) const
 
void attempt_rescue (const Alignment &aligned_read, Alignment &rescued_alignment, const std::vector< Minimizer > &minimizers, bool rescue_forward)
 
GaplessExtender::cluster_type seeds_in_subgraph (const std::vector< Minimizer > &minimizers, const std::unordered_set< id_t > &subgraph) const
 
void fix_dozeu_score (Alignment &rescued_alignment, const HandleGraph &rescue_graph, const std::vector< handle_t > &topological_order) const
 
int64_t distance_between (const Alignment &aln1, const Alignment &aln2)
 
void extension_to_alignment (const GaplessExtension &extension, Alignment &alignment) const
 
double compute_mapq_caps (const Alignment &aln, const std::vector< Minimizer > &minimizers, const SmallBitset &explored)
 
void find_optimal_tail_alignments (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, Alignment &best, Alignment &second_best) const
 
unordered_map< size_t, unordered_map< size_t, vector< Path > > > find_connecting_paths (const vector< GaplessExtension > &extended_seeds, size_t read_length) const
 
vector< TreeSubgraph > get_tail_forest (const GaplessExtension &extended_seed, size_t read_length, bool left_tails, size_t *longest_detectable_gap=nullptr) const
 
pair< Path, size_t > get_best_alignment_against_any_tree (const vector< TreeSubgraph > &trees, const string &sequence, const Position &default_position, bool pin_left, size_t longest_detectable_gap) const
 
void dfs_gbwt (const Position &from, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const
 
void dfs_gbwt (handle_t from_handle, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const
 
void dfs_gbwt (const gbwt::SearchState &start_state, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const
 
template<typename Item , typename Score = double>
void process_until_threshold_a (const vector< Item > &items, const function< Score(size_t)> &get_score, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const
 
template<typename Item , typename Score = double>
void process_until_threshold_b (const vector< Item > &items, const vector< Score > &scores, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const
 
template<typename Item , typename Score = double>
void process_until_threshold_c (const vector< Item > &items, const function< Score(size_t)> &get_score, const function< bool(size_t, size_t)> &comparator, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const
 
- Protected Member Functions inherited from vg::AlignerClient
 AlignerClient (double gc_content_estimate=vg::default_gc_content)
 
const GSSWAligner * get_aligner (bool have_qualities=true) const
 
const QualAdjAligner * get_qual_adj_aligner () const
 
const Aligner * get_regular_aligner () const
 

Static Protected Member Functions

static double window_breaking_quality (const vector< Minimizer > &minimizers, vector< size_t > &broken, const string &sequence, const string &quality_bytes)
 
static double faster_cap (const vector< Minimizer > &minimizers, vector< size_t > &minimizers_explored, const string &sequence, const string &quality_bytes)
 
static void for_each_aglomeration_interval (const vector< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t > &minimizer_indices, const function< void(size_t, size_t, size_t, size_t)> &iteratee)
 
static double get_log10_prob_of_disruption_in_interval (const vector< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t >::iterator &disrupt_begin, const vector< size_t >::iterator &disrupt_end, size_t left, size_t right)
 
static double get_prob_of_disruption_in_column (const vector< Minimizer > &minimizers, const string &sequence, const string &quality_bytes, const vector< size_t >::iterator &disrupt_begin, const vector< size_t >::iterator &disrupt_end, size_t index)
 
static int score_extension_group (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, int gap_open_penalty, int gap_extend_penalty)
 
static size_t immutable_path_from_length (const ImmutablePath &path)
 
static Path to_path (const ImmutablePath &path)
 
static void dump_debug_minimizers (const vector< Minimizer > &minimizers, const string &sequence, const vector< size_t > *to_include=nullptr)
 Dump all the given minimizers, with optional subset restriction. More...
 
static void dump_debug_extension_set (const HandleGraph &graph, const Alignment &aln, const vector< GaplessExtension > &extended_seeds)
 Dump all the extensions in an extension set. More...
 
static void dump_debug_sequence (ostream &out, const string &sequence)
 Print a sequence with base numbering. More...
 

Protected Attributes

const PathPositionHandleGraph * path_graph
 
const std::vector< gbwtgraph::DefaultMinimizerIndex * > & minimizer_indexes
 
MinimumDistanceIndex & distance_index
 
const gbwtgraph::GBWTGraph & gbwt_graph
 This is our primary graph. More...
 
GaplessExtender extender
 We have a gapless extender to extend seed hits in haplotype space. More...
 
SnarlSeedClusterer clusterer
 We have a clusterer. More...
 
FragmentLengthDistribution fragment_length_distr
 
atomic_flag warned_about_bad_distribution = ATOMIC_FLAG_INIT
 

Additional Inherited Members

- Static Public Member Functions inherited from vg::AlignerClient
static int8_t * parse_matrix (std::istream &matrix_stream)
 Allocates an array to hold a 4x4 substitution matrix and returns it. More...
 

Member Typedef Documentation

◆ Cluster

The information we store for each cluster.

◆ ImmutablePath

using vg::MinimizerMapper::ImmutablePath = structures::ImmutableList<Mapping>
protected

We define a type for shared-tail lists of Mappings, to avoid constantly copying Path objects.

◆ Seed

The information we store for each seed.

Member Enumeration Documentation

◆ RescueAlgorithm

Implemented rescue algorithms: no rescue, dozeu, GSSW, dozeu on local haplotypes.

Enumerator
rescue_none 
rescue_dozeu 
rescue_gssw 
rescue_haplotypes 

Constructor & Destructor Documentation

◆ MinimizerMapper()

vg::MinimizerMapper::MinimizerMapper ( const gbwtgraph::GBWTGraph &  graph,
const std::vector< gbwtgraph::DefaultMinimizerIndex * > &  minimizer_indexes,
MinimumDistanceIndex &  distance_index,
const PathPositionHandleGraph *  path_graph = nullptr 
)

Construct a new MinimizerMapper using the given indexes. The PathPositionHandleGraph can be nullptr, as we only use it for correctness tracking.

Member Function Documentation

◆ attempt_rescue()

void vg::MinimizerMapper::attempt_rescue ( const Alignment &  aligned_read,
Alignment &  rescued_alignment,
const std::vector< Minimizer > &  minimizers,
bool  rescue_forward 
)
protected

Given an aligned read, extract a subgraph of the graph within a distance range based on the fragment length distribution and attempt to align the unaligned read to it. Rescue_forward is true if the aligned read is the first and false otherwise. Assumes that both reads are facing the same direction. TODO: This should be const, but some of the function calls are not.

◆ compute_mapq_caps()

double vg::MinimizerMapper::compute_mapq_caps ( const Alignment &  aln,
const std::vector< Minimizer > &  minimizers,
const SmallBitset &  explored 
)
protected

Compute MAPQ caps based on all minimizers that are explored, for some definition of explored.

Needs access to the input alignment for sequence and quality information.

Returns only an "extended" cap at the moment.

◆ dfs_gbwt() [1/3]

void vg::MinimizerMapper::dfs_gbwt ( const gbwt::SearchState &  start_state,
size_t  from_offset,
size_t  walk_distance,
const function< void(const handle_t &)> &  enter_handle,
const function< void(void)>  exit_handle 
) const
protected

The same as dfs_gbwt on a handle and an offset, but takes a gbwt::SearchState that defines only some haplotypes on a handle to start with.

◆ dfs_gbwt() [2/3]

void vg::MinimizerMapper::dfs_gbwt ( const Position &  from,
size_t  walk_distance,
const function< void(const handle_t &)> &  enter_handle,
const function< void(void)>  exit_handle 
) const
protected

Run a DFS on valid haplotypes in the GBWT starting from the given Position, and continuing up to the given number of bases.

Calls enter_handle when the DFS enters a haplotype visit to a particular handle, and exit_handle when it exits a visit. These let the caller maintain a stack and track the traversals.

The starting node is only entered if its offset isn't equal to its length (i.e. bases remain to be visited).

Stopping early is not permitted.

◆ dfs_gbwt() [3/3]

void vg::MinimizerMapper::dfs_gbwt ( handle_t  from_handle,
size_t  from_offset,
size_t  walk_distance,
const function< void(const handle_t &)> &  enter_handle,
const function< void(void)>  exit_handle 
) const
protected

The same as dfs_gbwt on a Position, but takes a handle in the backing gbwt_graph and an offset from the start of the handle instead.

◆ distance_between()

int64_t vg::MinimizerMapper::distance_between ( const Alignment &  aln1,
const Alignment &  aln2 
)
protected

Get the distance between a pair of read alignments

◆ dump_debug_extension_set()

void vg::MinimizerMapper::dump_debug_extension_set ( const HandleGraph &  graph,
const Alignment &  aln,
const vector< GaplessExtension > &  extended_seeds 
)
staticprotected

Dump all the extensions in an extension set.

◆ dump_debug_minimizers()

void vg::MinimizerMapper::dump_debug_minimizers ( const vector< Minimizer > &  minimizers,
const string &  sequence,
const vector< size_t > *  to_include = nullptr 
)
staticprotected

Dump all the given minimizers, with optional subset restriction.

◆ dump_debug_sequence()

void vg::MinimizerMapper::dump_debug_sequence ( ostream &  out,
const string &  sequence 
)
staticprotected

Print a sequence with base numbering.

◆ extension_to_alignment()

void vg::MinimizerMapper::extension_to_alignment ( const GaplessExtension &  extension,
Alignment &  alignment 
) const
protected

Convert the GaplessExtension into an alignment. This assumes that the extension is a full-length alignment and that the sequence field of the alignment has been set.

◆ faster_cap()

double vg::MinimizerMapper::faster_cap ( const vector< Minimizer > &  minimizers,
vector< size_t > &  minimizers_explored,
const string &  sequence,
const string &  quality_bytes 
)
staticprotected

Compute a bound on the Phred score probability of a mapping being wrong because base errors and unlocated minimizer hits prevented us from finding the true alignment.

Algorithm uses a "sweep line" dynamic programming approach. For a read with minimizers aligned to it:

              000000000011111111112222222222
              012345678901234567890123456789
Read:         ******************************
Minimizer 1:     *****
Minimizer 2:        *****
Minimizer 3:                    *****
Minimizer 4:                       *****

For each distinct read interval of overlapping minimizers, e.g. in the example the intervals 3,4,5; 6,7; 8,9,10; 18,19,20; 21,22; and 23,24,25 we consider base errors that would result in the minimizers in the interval being incorrect.

We use dynamic programming sweeping left-to-right over the intervals to compute the probability of the minimum number of base errors needed to disrupt all the minimizers.

Will sort minimizers_explored (which is indices into minimizers) by minimizer start position.

◆ finalize_fragment_length_distr()

void vg::MinimizerMapper::finalize_fragment_length_distr ( )
inline

◆ find_connecting_paths()

unordered_map<size_t, unordered_map<size_t, vector<Path> > > vg::MinimizerMapper::find_connecting_paths ( const vector< GaplessExtension > &  extended_seeds,
size_t  read_length 
) const
protected

Find for each pair of extended seeds all the haplotype-consistent graph paths against which the intervening read sequence needs to be aligned.

Limits walks from each extended seed end to the longest detectable gap plus the remaining to-be-aligned sequence, both computed using the read length.

extended_seeds must be sorted by read start position. Any extended seeds that overlap in the read will be precluded from connecting.

numeric_limits<size_t>::max() is used to store sufficiently long Paths ending before sources (which cannot be reached from other extended seeds) and starting after sinks (which cannot reach any other extended seeds). Only sources and sinks have these "tail" paths.

Tail paths are only calculated if the MinimizerMapper has linear_tails set to true.

◆ find_minimizers()

std::vector< MinimizerMapper::Minimizer > vg::MinimizerMapper::find_minimizers ( const std::string &  sequence,
Funnel &  funnel 
) const
protected

Find the minimizers in the sequence using all minimizer indexes and return them sorted in descending order by score.

◆ find_optimal_tail_alignments()

void vg::MinimizerMapper::find_optimal_tail_alignments ( const Alignment &  aln,
const vector< GaplessExtension > &  extended_seeds,
Alignment &  best,
Alignment &  second_best 
) const
protected

Operating on the given input alignment, align the tails dangling off the given extended perfect-match seeds and produce an optimal alignment into the given output Alignment object, best, and the second best alignment into second_best.

◆ find_seeds()

std::vector< MinimizerMapper::Seed > vg::MinimizerMapper::find_seeds ( const std::vector< Minimizer > &  minimizers,
const Alignment &  aln,
Funnel &  funnel 
) const
protected

Find seeds for all minimizers passing the filters.

◆ fix_dozeu_score()

void vg::MinimizerMapper::fix_dozeu_score ( Alignment &  rescued_alignment,
const HandleGraph &  rescue_graph,
const std::vector< handle_t > &  topological_order 
) const
protected

When we use dozeu for rescue, the reported alignment score is incorrect. 1) Dozeu only gives the full-length bonus once. 2) There is no penalty for a softclip at the edge of the subgraph. This function calculates the score correctly. If the score is <= 0, we realign the read using GSSW. TODO: This should be unnecessary.

◆ for_each_aglomeration_interval()

void vg::MinimizerMapper::for_each_aglomeration_interval ( const vector< Minimizer > &  minimizers,
const string &  sequence,
const string &  quality_bytes,
const vector< size_t > &  minimizer_indices,
const function< void(size_t, size_t, size_t, size_t)> &  iteratee 
)
staticprotected

Given a collection of minimizers, and a list of the minimizers we actually care about (as indices into the collection), iterate over common intervals of overlapping minimizer agglomerations.

Calls the given callback with (left, right, bottom, top), where left is the first base of the agglomeration interval (inclusive), right is the last base of the agglomeration interval (exclusive), bottom is the index of the first minimizer with an agglomeration in the interval and top is the index of the last minimizer with an agglomeration in the interval (exclusive).

Note that bottom and top are offsets into minimizer_indices, NOT minimizers itself. Only contiguous ranges in minimizer_indices actually make sense.

◆ force_fragment_length_distr()

void vg::MinimizerMapper::force_fragment_length_distr ( double  mean,
double  stdev 
)
inline

◆ fragment_distr_is_finalized()

bool vg::MinimizerMapper::fragment_distr_is_finalized ( )
inline

◆ get_best_alignment_against_any_tree()

pair< Path, size_t > vg::MinimizerMapper::get_best_alignment_against_any_tree ( const vector< TreeSubgraph > &  trees,
const string &  sequence,
const Position &  default_position,
bool  pin_left,
size_t  longest_detectable_gap 
) const
protected

Find the best alignment of the given sequence against any of the trees provided in trees, where each tree is a TreeSubgraph over the GBWT graph. Each tree subgraph is rooted at the left in its own local coordinate space, even if we are pinning on the right.

If no mapping is possible (for example, because there are no trees), produce a pure insert at default_position.

Alignment is always pinned.

If pin_left is true, pin the alignment on the left to the root of each tree. Otherwise pin it on the right to the root of each tree.

Limits the length of the longest gap to longest_detectable_gap.

Returns alignments in gbwt_graph space.

◆ get_distance_limit()

size_t vg::MinimizerMapper::get_distance_limit ( size_t  read_length) const
inline

Get the distance limit for the given read length

◆ get_fragment_length_mean()

double vg::MinimizerMapper::get_fragment_length_mean ( ) const
inline

◆ get_fragment_length_sample_size()

size_t vg::MinimizerMapper::get_fragment_length_sample_size ( ) const
inline

◆ get_fragment_length_stdev()

double vg::MinimizerMapper::get_fragment_length_stdev ( ) const
inline

◆ get_log10_prob_of_disruption_in_interval()

double vg::MinimizerMapper::get_log10_prob_of_disruption_in_interval ( const vector< Minimizer > &  minimizers,
const string &  sequence,
const string &  quality_bytes,
const vector< size_t >::iterator &  disrupt_begin,
const vector< size_t >::iterator &  disrupt_end,
size_t  left,
size_t  right 
)
staticprotected

Gives the log10 prob of a base error in the given interval of the read, accounting for the disruption of specified minimizers.

minimizers is the collection of all minimizers

disrupt_begin and disrupt_end are iterators defining a sequence of indices of minimizers in minimizers that are disrupted.

left and right are the inclusive and exclusive bounds of the interval of the read where the disruption occurs.

◆ get_prob_of_disruption_in_column()

double vg::MinimizerMapper::get_prob_of_disruption_in_column ( const vector< Minimizer > &  minimizers,
const string &  sequence,
const string &  quality_bytes,
const vector< size_t >::iterator &  disrupt_begin,
const vector< size_t >::iterator &  disrupt_end,
size_t  index 
)
staticprotected

Gives the raw probability of a base error in the given column of the read, accounting for the disruption of specified minimizers.

minimizers is the collection of all minimizers

disrupt_begin and disrupt_end are iterators defining a sequence of indices of minimizers in minimizers that are disrupted.

index is the position in the read where the disruption occurs.

◆ get_tail_forest()

vector< TreeSubgraph > vg::MinimizerMapper::get_tail_forest ( const GaplessExtension &  extended_seed,
size_t  read_length,
bool  left_tails,
size_t *  longest_detectable_gap = nullptr 
) const
protected

Get all the trees defining tails off the specified side of the specified gapless extension. Should only be called if a tail on that side exists, or this is a waste of time.

If the gapless extension starts or ends at a node boundary, there may be multiple trees produced, each with a distinct root.

If the gapless extension abuts the edge of the read, an empty forest will be produced.

Each tree is represented as a TreeSubgraph over our gbwt_graph.

If left_tails is true, the trees read out of the left sides of the gapless extension. Otherwise they read out of the right side.

As a side effect, saves the length of the longest detectable gap in an alignment of a tail to the forest into the provided location, if set.

◆ immutable_path_from_length()

size_t vg::MinimizerMapper::immutable_path_from_length ( const ImmutablePath &  path)
staticprotected

Get the from length of an ImmutablePath.

Can't be called path_from_length or it will shadow the one for Paths instead of overloading.

◆ map() [1/2]

vector< Alignment > vg::MinimizerMapper::map ( Alignment &  aln)

Map the given read. Return a vector of alignments that it maps to, winner first.

◆ map() [2/2]

void vg::MinimizerMapper::map ( Alignment &  aln,
AlignmentEmitter &  alignment_emitter 
)

Map the given read, and send output to the given AlignmentEmitter. May be run from any thread. TODO: Can't be const because the clusterer's cluster_seeds isn't const.

◆ map_paired() [1/2]

pair< vector< Alignment >, vector< Alignment > > vg::MinimizerMapper::map_paired ( Alignment &  aln1,
Alignment &  aln2 
)

Map the given pair of reads, where aln1 is upstream of aln2 and they are oriented towards each other in the graph.

If the fragment length distribution is not yet fixed, reads will be mapped independently. Otherwise, they will be mapped according to the fragment length distribution.

◆ map_paired() [2/2]

pair< vector< Alignment >, vector< Alignment > > vg::MinimizerMapper::map_paired ( Alignment &  aln1,
Alignment &  aln2,
vector< pair< Alignment, Alignment >> &  ambiguous_pair_buffer 
)

Map the given pair of reads, where aln1 is upstream of aln2 and they are oriented towards each other in the graph.

If the reads are ambiguous and there's no fragment length distribution fixed yet, they will be dropped into ambiguous_pair_buffer.

Otherwise, at least one result will be returned for them (although it may be the unmapped alignment).

◆ process_until_threshold_a()

template<typename Item , typename Score >
void vg::MinimizerMapper::process_until_threshold_a ( const vector< Item > &  items,
const function< Score(size_t)> &  get_score,
double  threshold,
size_t  min_count,
size_t  max_count,
const function< bool(size_t)> &  process_item,
const function< void(size_t)> &  discard_item_by_count,
const function< void(size_t)> &  discard_item_by_score 
) const
protected

Given a vector of items, a function to get the score of each, a score-difference-from-the-best cutoff, and a min and max processed item count, process items in descending score order by calling process_item with the item's number, until min_count items are processed and either max_count items are processed or the score difference threshold is hit (or we run out of items).

If process_item returns false, the item is skipped and does not count against min_count or max_count.

Call discard_item_by_count with the item's number for all remaining items that would pass the score threshold.

Call discard_item_by_score with the item's number for all remaining items that would fail the score threshold.

◆ process_until_threshold_b()

template<typename Item , typename Score >
void vg::MinimizerMapper::process_until_threshold_b ( const vector< Item > &  items,
const vector< Score > &  scores,
double  threshold,
size_t  min_count,
size_t  max_count,
const function< bool(size_t)> &  process_item,
const function< void(size_t)> &  discard_item_by_count,
const function< void(size_t)> &  discard_item_by_score 
) const
protected

Same as the other process_until_threshold functions, except using a vector to supply scores.

◆ process_until_threshold_c()

template<typename Item , typename Score >
void vg::MinimizerMapper::process_until_threshold_c ( const vector< Item > &  items,
const function< Score(size_t)> &  get_score,
const function< bool(size_t, size_t)> &  comparator,
double  threshold,
size_t  min_count,
size_t  max_count,
const function< bool(size_t)> &  process_item,
const function< void(size_t)> &  discard_item_by_count,
const function< void(size_t)> &  discard_item_by_score 
) const
protected

Same as the other process_until_threshold functions, except user supplies comparator to sort the items (must still be sorted by score).

◆ score_cluster()

void vg::MinimizerMapper::score_cluster ( Cluster &  cluster,
size_t  i,
const std::vector< Minimizer > &  minimizers,
const std::vector< Seed > &  seeds,
size_t  seq_length,
Funnel &  funnel 
) const
protected

Determine cluster score, read coverage, and a vector of flags for the minimizers present in the cluster. Score is the sum of the scores of distinct minimizers in the cluster, while read coverage is the fraction of the read covered by seeds in the cluster.

◆ score_extension_group()

int vg::MinimizerMapper::score_extension_group ( const Alignment &  aln,
const vector< GaplessExtension > &  extended_seeds,
int  gap_open_penalty,
int  gap_extend_penalty 
)
staticprotected

Score the given group of gapless extensions. Determines the best score that can be obtained by chaining extensions together, using the given gap open and gap extend penalties to charge for either overlaps or gaps in coverage of the read.

Enforces that overlaps cannot result in containment.

Input extended seeds must be sorted by start position.

◆ score_extensions() [1/2]

std::vector< int > vg::MinimizerMapper::score_extensions ( const std::vector< std::pair< std::vector< GaplessExtension >, size_t >> &  extensions,
const Alignment &  aln,
Funnel &  funnel 
) const
protected

Score the set of extensions for each cluster using score_extension_group(). Return the scores in the same order as the extensions.

◆ score_extensions() [2/2]

std::vector< int > vg::MinimizerMapper::score_extensions ( const std::vector< std::vector< GaplessExtension >> &  extensions,
const Alignment &  aln,
Funnel &  funnel 
) const
protected

Score the set of extensions for each cluster using score_extension_group(). Return the scores in the same order as the extensions.

◆ seeds_in_subgraph()

GaplessExtender::cluster_type vg::MinimizerMapper::seeds_in_subgraph ( const std::vector< Minimizer > &  minimizers,
const std::unordered_set< id_t > &  subgraph 
) const
protected

Return all the non-redundant seeds in the subgraph, including those from minimizers not used for mapping.

◆ to_path()

Path vg::MinimizerMapper::to_path ( const ImmutablePath &  path)
staticprotected

Convert an ImmutablePath to a Path.

◆ window_breaking_quality()

static double vg::MinimizerMapper::window_breaking_quality ( const vector< Minimizer > &  minimizers,
vector< size_t > &  broken,
const string &  sequence,
const string &  quality_bytes 
)
staticprotected

Compute a bound on the Phred score probability of having created the agglomerations of the specified minimizers by base errors from the given sequence, which was sequenced with the given qualities.

No limit is imposed if broken is empty.

Takes the collection of all minimizers found, and a vector of the indices of minimizers we are interested in the agglomerations of. May modify the order of that index vector.

Also takes the sequence of the read (to avoid Ns) and the quality string (interpreted as a byte array).

Currently computes a lower-score-bound, upper-probability-bound, suitable for use as a mapping quality cap, by assuming the easiest-to-disrupt possible layout of the windows, and the lowest possible qualities for the disrupting bases.

Member Data Documentation

◆ cluster_coverage_threshold

double vg::MinimizerMapper::cluster_coverage_threshold = 0.3

◆ cluster_score_threshold

double vg::MinimizerMapper::cluster_score_threshold = 50

◆ clusterer

SnarlSeedClusterer vg::MinimizerMapper::clusterer
protected

We have a clusterer.

◆ distance_index

MinimumDistanceIndex& vg::MinimizerMapper::distance_index
protected

◆ distance_limit

size_t vg::MinimizerMapper::distance_limit = 200

◆ do_dp

bool vg::MinimizerMapper::do_dp = true

◆ extender

GaplessExtender vg::MinimizerMapper::extender
protected

We have a gapless extender to extend seed hits in haplotype space.

◆ extension_score_threshold

int vg::MinimizerMapper::extension_score_threshold = 1

◆ extension_set_score_threshold

double vg::MinimizerMapper::extension_set_score_threshold = 20

◆ fragment_length_distr

FragmentLengthDistribution vg::MinimizerMapper::fragment_length_distr
protected

◆ gbwt_graph

const gbwtgraph::GBWTGraph& vg::MinimizerMapper::gbwt_graph
protected

This is our primary graph.

◆ hard_hit_cap

size_t vg::MinimizerMapper::hard_hit_cap = 500

Ignore all minimizers with more than hard_hit_cap hits.

◆ hit_cap

size_t vg::MinimizerMapper::hit_cap = 10

Use all minimizers with at most hit_cap hits.

◆ max_alignments

size_t vg::MinimizerMapper::max_alignments = 8

How many extended clusters should we align, max?

◆ max_extensions

size_t vg::MinimizerMapper::max_extensions = 800

How many clusters should we align?

◆ max_local_extensions

size_t vg::MinimizerMapper::max_local_extensions = numeric_limits<size_t>::max()

How many extensions should we try as seeds within a mapping location?

◆ max_multimaps

size_t vg::MinimizerMapper::max_multimaps = 1

◆ max_rescue_attempts

size_t vg::MinimizerMapper::max_rescue_attempts = 15

For paired end mapping, how many times should we attempt rescue (per read)?

◆ min_extensions

size_t vg::MinimizerMapper::min_extensions = 2

Accept at least this many clusters.

◆ minimizer_indexes

const std::vector<gbwtgraph::DefaultMinimizerIndex*>& vg::MinimizerMapper::minimizer_indexes
protected

◆ minimizer_score_fraction

double vg::MinimizerMapper::minimizer_score_fraction = 0.9

Take minimizers between hit_cap and hard_hit_cap hits until this fraction of total score

◆ pad_cluster_score_threshold

double vg::MinimizerMapper::pad_cluster_score_threshold = 20

◆ paired_distance_stdevs

double vg::MinimizerMapper::paired_distance_stdevs = 2.0

◆ paired_rescue_score_limit

double vg::MinimizerMapper::paired_rescue_score_limit = 0.9

How close does an alignment have to be to the best alignment for us to rescue on it.

◆ path_graph

const PathPositionHandleGraph* vg::MinimizerMapper::path_graph
protected

◆ read_group

string vg::MinimizerMapper::read_group

◆ rescue_algorithm

RescueAlgorithm vg::MinimizerMapper::rescue_algorithm = rescue_dozeu

The algorithm used for rescue.

◆ rescue_subgraph_stdevs

double vg::MinimizerMapper::rescue_subgraph_stdevs = 4.0

How many stdevs from the mean do we extract a subgraph from?

◆ sample_name

string vg::MinimizerMapper::sample_name

◆ track_correctness

bool vg::MinimizerMapper::track_correctness = false

Guess which seed hits are correct by location in the linear reference and track if/when their descendants make it through stages of the algorithm. Only works if track_provenance is true.

◆ track_provenance

bool vg::MinimizerMapper::track_provenance = false

Track which internal work items came from which others during each stage of the mapping algorithm.

◆ warned_about_bad_distribution

atomic_flag vg::MinimizerMapper::warned_about_bad_distribution = ATOMIC_FLAG_INIT
protected

The documentation for this class was generated from the following files: