vg
tools for working with variation graphs
Public Member Functions | Public Attributes | Protected Types | Protected Member Functions | Static Protected Member Functions | Protected Attributes | List of all members
vg::MinimizerMapper Class Reference

#include <minimizer_mapper.hpp>

Inheritance diagram for vg::MinimizerMapper:
vg::AlignerClient

Public Member Functions

 MinimizerMapper (const gbwtgraph::GBWTGraph &graph, const gbwtgraph::DefaultMinimizerIndex &minimizer_index, MinimumDistanceIndex &distance_index, const PathPositionHandleGraph *path_graph=nullptr)
 
void map (Alignment &aln, AlignmentEmitter &alignment_emitter)
 
- Public Member Functions inherited from vg::AlignerClient
void set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus, uint32_t xdrop_max_gap_length=default_xdrop_max_gap_length)
 Set all the aligner scoring parameters and create the stored aligner instances. More...
 
void load_scoring_matrix (std::ifstream &matrix_stream)
 Load a scoring amtrix from a file to set scores. More...
 

Public Attributes

size_t hit_cap = 10
 Use all minimizers with at most hit_cap hits. More...
 
size_t hard_hit_cap = 300
 Ignore all minimizers with more than hard_hit_cap hits. More...
 
double minimizer_score_fraction = 0.6
 
size_t max_extensions = 48
 How many clusters should we align? More...
 
size_t max_alignments = 8
 How many extended clusters should we align, max? More...
 
size_t max_local_extensions = numeric_limits<size_t>::max()
 How many extensions should we try as seeds within a mapping location? More...
 
double cluster_score_threshold = 0
 
double cluster_coverage_threshold = 0
 
double extension_set_score_threshold = 0
 
int extension_score_threshold = 1
 
size_t max_multimaps = 1
 
size_t distance_limit = 1000
 
bool do_dp = true
 
string sample_name
 
string read_group
 
bool track_provenance = false
 
bool track_correctness = false
 
- Public Attributes inherited from vg::AlignerClient
bool adjust_alignments_for_base_quality = false
 

Protected Types

using ImmutablePath = structures::ImmutableList< Mapping >
 

Protected Member Functions

void find_optimal_tail_alignments (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, Alignment &best, Alignment &second_best) const
 
unordered_map< size_t, unordered_map< size_t, vector< Path > > > find_connecting_paths (const vector< GaplessExtension > &extended_seeds, size_t read_length) const
 
vector< TreeSubgraphget_tail_forest (const GaplessExtension &extended_seed, size_t read_length, bool left_tails) const
 
pair< Path, size_t > get_best_alignment_against_any_tree (const vector< TreeSubgraph > &trees, const string &sequence, const Position &default_position, bool pin_left) const
 
void dfs_gbwt (const Position &from, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const
 
void dfs_gbwt (handle_t from_handle, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const
 
void dfs_gbwt (const gbwt::SearchState &start_state, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const
 
template<typename Item , typename Score = double>
void process_until_threshold (const vector< Item > &items, const function< Score(size_t)> &get_score, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const
 
template<typename Item , typename Score = double>
void process_until_threshold (const vector< Item > &items, const vector< Score > &scores, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const
 
template<typename Item , typename Score = double>
void process_until_threshold (const vector< Item > &items, const vector< Score > &scores, const function< bool(size_t, size_t)> &comparator, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const
 
- Protected Member Functions inherited from vg::AlignerClient
 AlignerClient (double gc_content_estimate=vg::default_gc_content)
 
const GSSWAlignerget_aligner (bool have_qualities=true) const
 
const QualAdjAlignerget_qual_adj_aligner () const
 
const Alignerget_regular_aligner () const
 

Static Protected Member Functions

static int score_extension_group (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, int gap_open_penalty, int gap_extend_penalty)
 
static size_t immutable_path_from_length (const ImmutablePath &path)
 
static Path to_path (const ImmutablePath &path)
 

Protected Attributes

const PathPositionHandleGraphpath_graph
 
const gbwtgraph::DefaultMinimizerIndex & minimizer_index
 
MinimumDistanceIndexdistance_index
 
const gbwtgraph::GBWTGraph & gbwt_graph
 This is our primary graph. More...
 
GaplessExtender extender
 We have a gapless extender to extend seed hits in haplotype space. More...
 
SnarlSeedClusterer clusterer
 We have a clusterer. More...
 

Member Typedef Documentation

◆ ImmutablePath

using vg::MinimizerMapper::ImmutablePath = structures::ImmutableList<Mapping>
protected

We define a type for shared-tail lists of Mappings, to avoid constantly copying Path objects.

Constructor & Destructor Documentation

◆ MinimizerMapper()

vg::MinimizerMapper::MinimizerMapper ( const gbwtgraph::GBWTGraph &  graph,
const gbwtgraph::DefaultMinimizerIndex &  minimizer_index,
MinimumDistanceIndex distance_index,
const PathPositionHandleGraph path_graph = nullptr 
)

Construct a new MinimizerMapper using the given indexes. The PathPositionhandleGraph can be nullptr, as we only use it for correctness tracking.

Member Function Documentation

◆ dfs_gbwt() [1/3]

void vg::MinimizerMapper::dfs_gbwt ( const gbwt::SearchState &  start_state,
size_t  from_offset,
size_t  walk_distance,
const function< void(const handle_t &)> &  enter_handle,
const function< void(void)>  exit_handle 
) const
protected

The same as dfs_gbwt on a handle and an offset, but takes a gbwt::SearchState that defines only some haplotypes on a handle to start with.

◆ dfs_gbwt() [2/3]

void vg::MinimizerMapper::dfs_gbwt ( const Position from,
size_t  walk_distance,
const function< void(const handle_t &)> &  enter_handle,
const function< void(void)>  exit_handle 
) const
protected

Run a DFS on valid haplotypes in the GBWT starting from the given Position, and continuing up to the given number of bases.

Calls enter_handle when the DFS enters a haplotype visit to a particular handle, and exit_handle when it exits a visit. These let the caller maintain a stack and track the traversals.

The starting node is only entered if its offset isn't equal to its length (i.e. bases remain to be visited).

Stopping early is not permitted.

◆ dfs_gbwt() [3/3]

void vg::MinimizerMapper::dfs_gbwt ( handle_t  from_handle,
size_t  from_offset,
size_t  walk_distance,
const function< void(const handle_t &)> &  enter_handle,
const function< void(void)>  exit_handle 
) const
protected

The same as dfs_gbwt on a Position, but takes a handle in the backing gbwt_graph and an offset from the start of the handle instead.

◆ find_connecting_paths()

unordered_map<size_t, unordered_map<size_t, vector<Path> > > vg::MinimizerMapper::find_connecting_paths ( const vector< GaplessExtension > &  extended_seeds,
size_t  read_length 
) const
protected

Find for each pair of extended seeds all the haplotype-consistent graph paths against which the intervening read sequence needs to be aligned.

Limits walks from each extended seed end to the longest detectable gap plus the remaining to-be-alinged sequence, both computed using the read length.

extended_seeds must be sorted by read start position. Any extended seeds that overlap in the read will be precluded from connecting.

numeric_limits<size_t>::max() is used to store sufficiently long Paths ending before sources (which cannot be reached from other extended seeds) and starting after sinks (which cannot reach any other extended seeds). Only sources and sinks have these "tail" paths.

Tail paths are only calculated if the MinimizerMapper has linear_tails set to true.

◆ find_optimal_tail_alignments()

void vg::MinimizerMapper::find_optimal_tail_alignments ( const Alignment aln,
const vector< GaplessExtension > &  extended_seeds,
Alignment best,
Alignment second_best 
) const
protected

Operating on the given input alignment, align the tails dangling off the given extended perfect-match seeds and produce an optimal alignment into the given output Alignment object, best, and the second best alignment into second_best.

◆ get_best_alignment_against_any_tree()

pair< Path, size_t > vg::MinimizerMapper::get_best_alignment_against_any_tree ( const vector< TreeSubgraph > &  trees,
const string &  sequence,
const Position default_position,
bool  pin_left 
) const
protected

Find the best alignment of the given sequence against any of the trees provided in trees, where each tree is a TreeSubgraph over the GBWT graph. Each tree subgraph is rooted at the left in its own local coordinate space, even if we are pinning on the right.

If no mapping is possible (for example, because there are no trees), produce a pure insert at default_position.

Alignment is always pinned.

If pin_left is true, pin the alignment on the left to the root of each tree. Otherwise pin it on the right to the root of each tree.

Returns alingments in gbwt_graph space.

◆ get_tail_forest()

vector< TreeSubgraph > vg::MinimizerMapper::get_tail_forest ( const GaplessExtension extended_seed,
size_t  read_length,
bool  left_tails 
) const
protected

Get all the trees defining tails off the specified side of the specified gapless extension. Should only be called if a tail on that side exists, or this is a waste of time.

If the gapless extension starts or ends at a node boundary, there may be multiple trees produced, each with a distinct root.

If the gapless extension abuts the edge of the read, an empty forest will be produced.

Each tree is represented as a TreeSubgraph over our gbwt_graph.

If left_tails is true, the trees read out of the left sides of the gapless extension. Otherwise they read out of the right side.

◆ immutable_path_from_length()

size_t vg::MinimizerMapper::immutable_path_from_length ( const ImmutablePath path)
staticprotected

Get the from length of an ImmutabelPath.

Can't be called path_from_length or it will shadow the one for Paths instead of overloading.

◆ map()

void vg::MinimizerMapper::map ( Alignment aln,
AlignmentEmitter alignment_emitter 
)

Map the given read, and send output to the given AlignmentEmitter. May be run from any thread. TODO: Can't be const because the clusterer's cluster_seeds isn't const.

◆ process_until_threshold() [1/3]

template<typename Item , typename Score >
void vg::MinimizerMapper::process_until_threshold ( const vector< Item > &  items,
const function< Score(size_t)> &  get_score,
double  threshold,
size_t  min_count,
size_t  max_count,
const function< bool(size_t)> &  process_item,
const function< void(size_t)> &  discard_item_by_count,
const function< void(size_t)> &  discard_item_by_score 
) const
protected

Given a vector of items, a function to get the score of each, a score-difference-from-the-best cutoff, and a min and max processed item count, process items in descending score order by calling process_item with the item's number, until min_count items are processed and either max_count items are processed or the score difference threshold is hit (or we run out of items).

If process_item returns false, the item is skipped and does not count against min_count or max_count.

Call discard_item_by_count with the item's number for all remaining items that would pass the score threshold.

Call discard_item_by_score with the item's number for all remaining items that would fail the score threshold.

◆ process_until_threshold() [2/3]

template<typename Item , typename Score >
void vg::MinimizerMapper::process_until_threshold ( const vector< Item > &  items,
const vector< Score > &  scores,
const function< bool(size_t, size_t)> &  comparator,
double  threshold,
size_t  min_count,
size_t  max_count,
const function< bool(size_t)> &  process_item,
const function< void(size_t)> &  discard_item_by_count,
const function< void(size_t)> &  discard_item_by_score 
) const
protected

Same as the other process_until_threshold overload, except user supplies comparator to sort the items (must still be sorted by score).

◆ process_until_threshold() [3/3]

template<typename Item , typename Score >
void vg::MinimizerMapper::process_until_threshold ( const vector< Item > &  items,
const vector< Score > &  scores,
double  threshold,
size_t  min_count,
size_t  max_count,
const function< bool(size_t)> &  process_item,
const function< void(size_t)> &  discard_item_by_count,
const function< void(size_t)> &  discard_item_by_score 
) const
protected

Same as the other process_until_threshold overload, except using a vector to supply scores.

◆ score_extension_group()

int vg::MinimizerMapper::score_extension_group ( const Alignment aln,
const vector< GaplessExtension > &  extended_seeds,
int  gap_open_penalty,
int  gap_extend_penalty 
)
staticprotected

Score the given group of gapless extensions. Determines the best score that can be obtained by chaining extensions together, using the given gap open and gap extend penalties to charge for either overlaps or gaps in coverage of the read.

Enforces that overlaps cannot result in containment.

Input extended seeds must be sorted by start position.

◆ to_path()

Path vg::MinimizerMapper::to_path ( const ImmutablePath path)
staticprotected

Convert an ImmutablePath to a Path.

Member Data Documentation

◆ cluster_coverage_threshold

double vg::MinimizerMapper::cluster_coverage_threshold = 0

◆ cluster_score_threshold

double vg::MinimizerMapper::cluster_score_threshold = 0

◆ clusterer

SnarlSeedClusterer vg::MinimizerMapper::clusterer
protected

We have a clusterer.

◆ distance_index

MinimumDistanceIndex& vg::MinimizerMapper::distance_index
protected

◆ distance_limit

size_t vg::MinimizerMapper::distance_limit = 1000

◆ do_dp

bool vg::MinimizerMapper::do_dp = true

◆ extender

GaplessExtender vg::MinimizerMapper::extender
protected

We have a gapless extender to extend seed hits in haplotype space.

◆ extension_score_threshold

int vg::MinimizerMapper::extension_score_threshold = 1

◆ extension_set_score_threshold

double vg::MinimizerMapper::extension_set_score_threshold = 0

◆ gbwt_graph

const gbwtgraph::GBWTGraph& vg::MinimizerMapper::gbwt_graph
protected

This is our primary graph.

◆ hard_hit_cap

size_t vg::MinimizerMapper::hard_hit_cap = 300

Ignore all minimizers with more than hard_hit_cap hits.

◆ hit_cap

size_t vg::MinimizerMapper::hit_cap = 10

Use all minimizers with at most hit_cap hits.

◆ max_alignments

size_t vg::MinimizerMapper::max_alignments = 8

How many extended clusters should we align, max?

◆ max_extensions

size_t vg::MinimizerMapper::max_extensions = 48

How many clusters should we align?

◆ max_local_extensions

size_t vg::MinimizerMapper::max_local_extensions = numeric_limits<size_t>::max()

How many extensions should we try as seeds within a mapping location?

◆ max_multimaps

size_t vg::MinimizerMapper::max_multimaps = 1

◆ minimizer_index

const gbwtgraph::DefaultMinimizerIndex& vg::MinimizerMapper::minimizer_index
protected

◆ minimizer_score_fraction

double vg::MinimizerMapper::minimizer_score_fraction = 0.6

Take minimizers between hit_cap and hard_hit_cap hits until this fraction of total score

◆ path_graph

const PathPositionHandleGraph* vg::MinimizerMapper::path_graph
protected

◆ read_group

string vg::MinimizerMapper::read_group

◆ sample_name

string vg::MinimizerMapper::sample_name

◆ track_correctness

bool vg::MinimizerMapper::track_correctness = false

Guess which seed hits are correct by location in the linear reference and track if/when their descendants make it through stages of the algorithm. Only works if track_provenance is true.

◆ track_provenance

bool vg::MinimizerMapper::track_provenance = false

Track which internal work items came from which others during each stage of the mapping algorithm.


The documentation for this class was generated from the following files: