NEFFy
NEFF Calculator and MSA File Converter
Loading...
Searching...
No Matches
neff.cpp File Reference

Neff Computation. More...

#include "flagHandler.h"
#include "msaReader.h"
#include "msaWriter.h"
#include "multimerHandler.h"
#include <iostream>
#include <vector>
#include <string>
#include <unordered_map>
#include <cmath>
#include <sstream>
#include <climits>
#include "common.h"
#include <fstream>
#include <algorithm>
#include <tuple>
#include <random>
#include <set>
#include <thread>
#include <future>
#include <mutex>
#include <chrono>

Functions

int char2num (char c, const string &standardLetters, const string &nonStandardLetters, NonStandardHandler nonStandardOption)
 Map char residues to digit based on given 'nonStandardOption'.
 
void removeGappyPositions (vector< vector< int > > &sequences, float gapCutoff)
 Remove gappy positions from sequences based on given 'gapCutoff'.
 
vector< vector< int > > processSequences (vector< Sequence > sequences, string standardLetters, string nonStandardLetters, NonStandardHandler nonStandardOption, float gapCutoff)
 Map chars to digits based on provided 'nonStandardOption' and also remove gappy positions based on given 'gapCutoff'.
 
vector< int > computeWeights (vector< vector< int > > sequences, float threshold, bool isSymmetric, string standardLetters, NonStandardHandler nonStandardOption)
 Compute sequence weights based on given options.
 
float computeNeff (vector< int > sequenceWeights, Normalization norm, int length)
 Compute NEFF values based on sequence weights and given normalization.
 
Alphabet getAlphabet (FlagHandler &flagHandler)
 Get the alphabet given by the user.
 
Normalization getNormalization (FlagHandler &flagHandler)
 Get the normalization option given by the user.
 
NonStandardHandler getNonStandardOption (FlagHandler &flagHandler)
 Get the non_standard_option value given by the user.
 
void checkFlags (FlagHandler &flagHandler)
 Check flags

 
void setDepth (vector< Sequence > &sequences, int depth)
 Set MSA depth based on 'depth' flag.
 
int getNonGapStartPosition (string firstAlignement, int startPos)
 Get the index of the given startPos in the original first sequence of the MSA, regardless of gaps in the MSA.
 
int getNonGapEndPosition (string firstAlignement, int startPos, int length)
 Get the end index in the original sequence after a 'length' number of non-gap positions.
 
void getPositions (vector< Sequence > &sequences, FlagHandler flagHandler)
 Set the desired positions to compute NEFF for, based on the given 'pos_start' and 'pos_end' flags.
 
void integrateUniqueSequences (vector< Sequence > &integratedSequences, const vector< Sequence > &sequences)
 Merge sequences and remove redundant sequences.
 
vector< float > computeResidueNEFF (const vector< vector< int > > &sequences, const vector< int > &sequenceWeights, Normalization norm)
 Compute per-residue (column-wise) NEFF.
 
int main (int argc, char **argv)
 

Variables

const char * docstr
 
unordered_map< string, FlagInfo > Flags
 

Detailed Description

Neff Computation.

This program computes Number of EFFective sequences (NEFF) for a multiple sequence alignment (MSA) file. NEFF is a measure of effective sequence number that takes into account the redundancy and similarity of sequences in the MSA. It is commonly used in bioinformatics to assess the diversity of a set of sequences.

Usage: ./neff --file=<input_file> [options]

Options:
--file=<input_file> Input files (comma-separated, no spaces) containing multiple sequence alignments (required)
--alphabet=<value> Valid alphabet of MSA; alphabet option (0: Protein, 1: RNA, 2: DNA) (default: 0)
--check_validation=<true/false> Perform validation on sequences (default: false)
--threshold=<value> Threshold value of considering two sequences similar (default: 0.8)
--norm=<value> NEFF normalization option (0: sqrt(Length of alignment), 1: Length of alignment, 2: No normalization) (default: 0)
--omit_query_gaps=<true/false> Omit gap positions of query sequence from all sequences for NEFF computation (default: true)
--is_symmetric=<true/false> Consider gaps in similarity cutoff computation (asymmetric) or not (symmetric) (default: true)
--non_standard_option=<value> Handling non-standard letters in the given alphabet (0: AsStandard, 1: ConsiderGapInCutoff, 2: ConsiderGap) (default: 0)
--depth=<value> Depth of MSA to be considered in computation (default: depth of given MSA)
--gap_cutoff=<value> Cutoff value for removing gappy positions, when #gaps in position >= gap_cutoff (default: 1, which does not remove anything)
--pos_start=<value> Start position of each sequence to be considered in NEFF, inclusive (default: 1)
--pos_end=<value> Last position of each sequence to be considered in NEFF, inclusive (default: length of MSA sequence)
--only_weights=<true/false> Return only sequence weights, as the number of similar sequences, rather than the final NEFF (default: false)
--multimer_MSA=<true/false> Compute NEFF for a multimer MSA (default: false)
--stoichiom=<value> Multimer stoichiometry (default: empty)
--chain_length=<value> Length of the chains in heteromer multimer (default: 0)
--residue_neff=<true/false> Compute per-residue (column-wise) NEFF (default: false)

For detailed instructions, please refer to the documentation at https://maryam-haghani.github.io/NEFFy.

Function Documentation

◆ char2num()

int char2num ( char c,
const string & standardLetters,
const string & nonStandardLetters,
NonStandardHandler nonStandardOption )

Map char residues to digit based on given 'nonStandardOption'.

Parameters
c	input letter
standardLetters
nonStandardLetters
nonStandardOption
Returns

◆ checkFlags()

void checkFlags ( FlagHandler & flagHandler)

Check flags

Parameters
flagHandler

◆ computeNeff()

float computeNeff ( vector< int > sequenceWeights,
Normalization norm,
int length )

Compute NEFF values based on sequence weights and given normalization.

Parameters
sequenceWeights
norm
length
Returns

◆ computeResidueNEFF()

vector< float > computeResidueNEFF ( const vector< vector< int > > & sequences,
const vector< int > & sequenceWeights,
Normalization norm )

Compute per-residue (column-wise) NEFF.

Parameters
sequences
sequenceWeights
norm
Returns

◆ computeWeights()

vector< int > computeWeights ( vector< vector< int > > sequences,
float threshold,
bool isSymmetric,
string standardLetters,
NonStandardHandler nonStandardOption )

Compute sequence weights based on given options.

Parameters
sequences
threshold
isSymmetric
standardLetters
nonStandardOption
Returns
inverse of sequence weights

◆ getAlphabet()

Alphabet getAlphabet ( FlagHandler & flagHandler)

Get the alphabet given by the user.

Parameters
flagHandler
Returns

◆ getNonGapEndPosition()

int getNonGapEndPosition ( string firstAlignement,
int startPos,
int length )

Get the end index in the original sequence after a 'length' number of non-gap positions.

Parameters
firstAlignement
startPos
length
Returns
The end index in the MSA sequence including gaps.

◆ getNonGapStartPosition()

int getNonGapStartPosition ( string firstAlignement,
int startPos )

Get the index of the given startPos in the original first sequence of the MSA, regardless of gaps in the MSA.

Parameters
firstAlignement
startPos
Returns
startPos + number of gaps until that position for the first sequence in the MSA

◆ getNonStandardOption()

NonStandardHandler getNonStandardOption ( FlagHandler & flagHandler)

Get the non_standard_option value given by the user.

Parameters
flagHandler
Returns

◆ getNormalization()

Normalization getNormalization ( FlagHandler & flagHandler)

Get the normalization option given by the user.

Parameters
flagHandler
Returns

◆ getPositions()

void getPositions ( vector< Sequence > & sequences,
FlagHandler flagHandler )

Set the desired positions to compute NEFF for, based on the given 'pos_start' and 'pos_end' flags.

Parameters
sequences
flagHandler

◆ integrateUniqueSequences()

void integrateUniqueSequences ( vector< Sequence > & integratedSequences,
const vector< Sequence > & sequences )

Merge sequences and remove redundant sequences.

Parameters
integratedSequences
sequences

◆ main()

int main ( int argc,
char ** argv )

omit gap positions of query sequence in all sequences if omitGapsInQuery=true

◆ processSequences()

vector< vector< int > > processSequences ( vector< Sequence > sequences,
string standardLetters,
string nonStandardLetters,
NonStandardHandler nonStandardOption,
float gapCutoff )

Map chars to digits based on provided 'nonStandardOption' and also remove gappy positions based on given 'gapCutoff'.

Parameters
sequences
standardLetters
nonStandardLetters
nonStandardOption
gapCutoff
Returns

◆ removeGappyPositions()

void removeGappyPositions ( vector< vector< int > > & sequences,
float gapCutoff )

Remove gappy positions from sequences based on given 'gapCutoff'.

Parameters
sequences
gapCutoff

◆ setDepth()

void setDepth ( vector< Sequence > & sequences,
int depth )

Set MSA depth based on 'depth' flag.

Parameters
sequences
depth

Variable Documentation

◆ docstr

const char* docstr
Initial value:
= R"(
This program computes the Number of Effective Sequences (NEFF) for a multiple sequence alignment (MSA) file.
NEFF is a measure of the effective sequence number that accounts for the redundancy and similarity of sequences in the MSA.
Usage:
./neff --file=<input_file> [options]
For detailed instructions, please refer to the documentation at https://maryam-haghani.github.io/NEFFy.
)"

◆ Flags

unordered_map<string, FlagInfo> Flags
Initial value:
=
{
{"file", {true, ""}},
{"alphabet", {false, "0"}},
{"check_validation", {false, "false"}},
{"threshold", {false, "0.8"}},
{"norm", {false, "0"}},
{"omit_query_gaps", {false, "true"}},
{"is_symmetric", {false, "true"}},
{"non_standard_option", {false, "0"}},
{"depth", {false, "inf"}},
{"gap_cutoff", {false, "1"}},
{"pos_start", {false, "1"}},
{"pos_end", {false, "inf"}},
{"only_weights", {false, "false"}},
{"multimer_MSA", {false, "false"}},
{"stoichiom", {false, ""}},
{"chain_length", {false, "0"}},
{"residue_neff", {false, "false"}}
}
Footer