Skip to content

Commit

Permalink
Merge pull request #304 from AdityaSavara/tab-separated-files
Browse files Browse the repository at this point in the history
Changing to tab separated reference files
  • Loading branch information
AdityaSavara authored Jul 3, 2022
2 parents de79c2d + eb85858 commit 8461fd6
Show file tree
Hide file tree
Showing 9 changed files with 151 additions and 166 deletions.
60 changes: 0 additions & 60 deletions AcetaldehydeNISTRefMixed2.csv

This file was deleted.

Binary file added AcetaldehydeNISTRefMixed2.tsv
Binary file not shown.
2 changes: 1 addition & 1 deletion DefaultUserInput.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

#//Input Files//
UserChoices['inputFiles'] = {} #initialize the inputFiles container
UserChoices['inputFiles']['referenceFileNamesList'] = ['AcetaldehydeNISTRefMixed2.csv'] #enter the file name of the file containing reference information
UserChoices['inputFiles']['referenceFileNamesList'] = ['AcetaldehydeNISTRefMixed2.tsv'] #enter the file name of the file containing reference information. tsv is tab-separated, csv is comma separated. tsv supports commas in molecule names.
UserChoices['inputFiles']['referenceFormsList'] = 'xyyy' #form is either 'xyyy' or 'xyxy' (if using reference pattern time chooser enter as list with forms for each individual reference file ['xyyy','xyyy','xyyy'])
UserChoices['inputFiles']['referencePatternTimeRanges'] = [] #Leave empty if not using reference pattern time chooser []
UserChoices['inputFiles']['collectedFileName'] = '2-CrotAcetExp#2.csv' #enter the file name with raw mass spectrometer data
Expand Down
87 changes: 66 additions & 21 deletions MSRESOLVE.py
Original file line number Diff line number Diff line change
Expand Up @@ -2545,7 +2545,7 @@ def IterativePrepareNextIterationInputFiles(ExperimentDataFullCopy):

#Now going to overwrite parallelized variables with their original versions if they were set to length of chosen molecules.
delimitedStringOfVariablesToUnparallelize = 'moleculeLikelihoods, sensitivityValues, referenceValueThreshold, referenceSignificantFragmentThresholds'
listOfVariablesToUnparallelize = delimitedStringOfVariablesToUnparallelize.split(", ") #Note that we are using ", " as the delimeter, not just ","
listOfVariablesToUnparallelize = delimitedStringOfVariablesToUnparallelize.split(", ") #Note that we are using ", " as the delimiter, not just ","
for variable in listOfVariablesToUnparallelize:
G.nextUserInputModule.__dict__[variable]=G.beforeParsedGDict[variable]

Expand Down Expand Up @@ -2603,6 +2603,22 @@ def IterativeAnalysisPostProcessing(ExperimentData, simulateddata, mass_fragment
#These functions read in the experimental data file and the reference file. The
#returned variables can then be used to initialize the respective classes.

#a small helper function to check if an extension exists in a filename and to return the delimiter based on that.
def getDelimiterFromExtension(filename):
    """Return the column delimiter implied by the extension of filename.

    The final extension decides the delimiter: tsv, tab, and txt map to a
    tab, skv maps to a semicolon, and csv maps to a comma. Matching is
    case-insensitive, and a bare extension such as "csv" (no leading dot) is
    accepted as well as a full file name. If the final extension is not
    recognized, the whole name is searched for a known ".ext" substring,
    which preserves the older substring-based behavior for names such as
    "data.csv.bak". Anything else, including an empty or missing name, gets a
    tab, which is the default delimiter for MSRESOLVE.

    filename: a file name, a path, or a bare extension.
    returns: the delimiter as a one-character string.
    """
    defaultDelimiter = '\t' #for MSRESOLVE, this is now the default delimiter.
    #The order of the pairs only matters for the substring fallback below.
    delimiterByExtension = (
        ('tsv', '\t'),
        ('tab', '\t'),
        ('txt', '\t'),
        ('skv', ';'),
        ('csv', ','), #a csv could use something else, but we will assume a comma.
    )
    if not filename:
        return defaultDelimiter
    loweredName = str(filename).lower()
    #rsplit leaves a bare extension such as "csv" unchanged, so both "ref.csv" and "csv" give "csv".
    finalExtension = loweredName.rsplit('.', 1)[-1]
    for extension, delimiter in delimiterByExtension:
        if finalExtension == extension:
            return delimiter
    #Fallback: the name does not end in a known extension, so look for one anywhere in the name.
    for extension, delimiter in delimiterByExtension:
        if '.' + extension in loweredName:
            return delimiter
    return defaultDelimiter

def readDataFile(collectedFileName):

#read the csv file into a dataframe. The variable dataFrame is a pandas DataFrame object.
Expand Down Expand Up @@ -2703,8 +2719,14 @@ def FromXYXYtoXYYY(provided_reference_patterns):
provided_reference_patterns = reference_holder
return provided_reference_patterns

#read the csv file into a dataframe
dataFrame = pandas.read_csv('%s' %referenceFileName, header = None)
#read the csv file into a dataframe
if '.csv' in referenceFileName:
dataFrame = pandas.read_csv('%s' %referenceFileName, header = None)
elif '.tsv' in referenceFileName:
try: #no easy way to assess utf16 vs utf8, so try both.
dataFrame = pandas.read_csv('%s' %referenceFileName, header = None, delimiter = '\t', encoding = 'utf8') #need to specify encoding for cases of tab delimited files.
except: #no easy way to assess utf16 vs utf8, so try both.
dataFrame = pandas.read_csv('%s' %referenceFileName, header = None, delimiter = '\t', encoding = 'utf16') #need to use utf16 for some cases of tab delimited files.

if form == 'xyyy':
for rowIndex in range(len(dataFrame)): #Loop through each row and check the abscissa value
Expand All @@ -2714,20 +2736,20 @@ def FromXYXYtoXYYY(provided_reference_patterns):
reference = dfreference.values #convert to matrix
provided_reference_patterns = reference.astype(float) #convert the matrix to floats
provided_reference_patterns = DataFunctions.removeColumnsWithAllvaluesBelowZeroOrThreshold(provided_reference_patterns,startingRowIndex=1) #clear row of zeros
break #exit the for loop
break #exit the for loop since the first non-header row has been reached.
except: #Otherwise the row consists of other information
if (dataFrame.iloc[rowIndex][0] == 'SourceOfFragmentationPatterns') or (dataFrame.iloc[rowIndex][0] == 'Source:'): #if the abscissa titles the source (both old and new reference files)
dfSourceOfFragmentationPatterns = dataFrame.iloc[rowIndex][1:] #select the row of names
SourceOfFragmentationPatterns = dfSourceOfFragmentationPatterns.values #convert to matrix
SourceOfFragmentationPatterns = SourceOfFragmentationPatterns.astype(numpy.str) #save as class object with type string
SourceOfFragmentationPatterns = SourceOfFragmentationPatterns.astype(str) #save as class object with type string
elif dataFrame.iloc[rowIndex][0] == 'sourceOfIonizationData':
dfsourceOfIonizationData = dataFrame.iloc[rowIndex][1:] #Select the row of names
sourceOfIonizationData = dfsourceOfIonizationData.values #convert to matrix
sourceOfIonizationData = sourceOfIonizationData.astype(numpy.str) #save as class object with type string
sourceOfIonizationData = sourceOfIonizationData.astype(str) #save as class object with type string
elif dataFrame.iloc[rowIndex][0] == 'Molecules': #if the abscissa titles the molecule names
dfmolecules = dataFrame.iloc[rowIndex][1:] #select the row of names
molecules = dfmolecules.values #convert to matrix
molecules = molecules.astype(numpy.str) #save as class object with type string
molecules = molecules.astype(str) #save as class object with type string
molecules = list(molecules)
for moleculeIndex in range(len(molecules)):
molecules[moleculeIndex] = molecules[moleculeIndex].strip()#remove leading and trailing whitespaces.
Expand All @@ -2742,7 +2764,7 @@ def FromXYXYtoXYYY(provided_reference_patterns):
elif dataFrame.iloc[rowIndex][0] == 'moleculeIonizationType':
dfmoleculeIonizationType = dataFrame.iloc[rowIndex][1:] #select row of names
moleculeIonizationType = dfmoleculeIonizationType.values #convert to matrix
moleculeIonizationType = moleculeIonizationType.astype(numpy.str) #save as class object with type string
moleculeIonizationType = moleculeIonizationType.astype(str) #save as class object with type string
elif (dataFrame.iloc[rowIndex][0] == 'relativeIonizationEfficiencies') or (dataFrame.iloc[rowIndex][0] == 'knownIonizationFactorsRelativeToN2'):
dfrelativeIonizationEfficiencies = dataFrame.iloc[rowIndex][1:] #select row of names
relativeIonizationEfficiencies = dfrelativeIonizationEfficiencies.values #convert to matrix
Expand Down Expand Up @@ -2799,7 +2821,7 @@ def FromXYXYtoXYYY(provided_reference_patterns):
# #convert to matrix
# molecules = dfmolecules.values
# #save as class object with type string
# molecules = molecules.astype(numpy.str)
# molecules = molecules.astype(str)
#
# '''generate list of molecular weights'''
# #select row of names
Expand All @@ -2815,7 +2837,7 @@ def FromXYXYtoXYYY(provided_reference_patterns):
# #convert to matrix
# sourceInfo = dfsourceInfo.values
# #save as class object with type string
# sourceInfo = sourceInfo.astype(numpy.str)
# sourceInfo = sourceInfo.astype(str)

'''list of massfragments monitored is not part of reference file'''
mass_fragment_numbers_monitored = None
Expand All @@ -2837,15 +2859,15 @@ def FromXYXYtoXYYY(provided_reference_patterns):
if (dataFrame.iloc[rowIndex][0] == 'SourceOfFragmentationPatterns') or (dataFrame.iloc[rowIndex][0] == 'Source:'): #if the abscissa titles the source (both old and new reference files)
dfSourceOfFragmentationPatterns = dataFrame.iloc[rowIndex][1::2] #select the row of names
SourceOfFragmentationPatterns = dfSourceOfFragmentationPatterns.values #convert to matrix
SourceOfFragmentationPatterns = SourceOfFragmentationPatterns.astype(numpy.str) #save as class object with type string
SourceOfFragmentationPatterns = SourceOfFragmentationPatterns.astype(str) #save as class object with type string
elif dataFrame.iloc[rowIndex][0] == 'sourceOfIonizationData':
dfsourceOfIonizationData = dataFrame.iloc[rowIndex][1::2] #Select the row of names
sourceOfIonizationData = dfsourceOfIonizationData.values #convert to matrix
sourceOfIonizationData = sourceOfIonizationData.astype(numpy.str) #save as class object with type string
sourceOfIonizationData = sourceOfIonizationData.astype(str) #save as class object with type string
elif dataFrame.iloc[rowIndex][0] == 'Molecules': #if the abscissa titles the molecule names
dfmolecules = dataFrame.iloc[rowIndex][1::2] #select the row of names
molecules = dfmolecules.values #convert to matrix
molecules = molecules.astype(numpy.str) #save as class object with type string
molecules = molecules.astype(str) #save as class object with type string
molecules = list(molecules)
for moleculeIndex in range(len(molecules)):
molecules[moleculeIndex] = molecules[moleculeIndex].strip()#remove leading and trailing whitespaces.
Expand All @@ -2860,7 +2882,7 @@ def FromXYXYtoXYYY(provided_reference_patterns):
elif dataFrame.iloc[rowIndex][0] == 'moleculeIonizationType':
dfmoleculeIonizationType = dataFrame.iloc[rowIndex][1::2] #select row of names
moleculeIonizationType = dfmoleculeIonizationType.values #convert to matrix
moleculeIonizationType = moleculeIonizationType.astype(numpy.str) #save as class object with type string
moleculeIonizationType = moleculeIonizationType.astype(str) #save as class object with type string
elif (dataFrame.iloc[rowIndex][0] == 'relativeIonizationEfficiencies') or (dataFrame.iloc[rowIndex][0] == 'knownIonizationFactorsRelativeToN2'):
dfrelativeIonizationEfficiencies = dataFrame.iloc[rowIndex][1::2] #select row of names
relativeIonizationEfficiencies = dfrelativeIonizationEfficiencies.values #convert to matrix
Expand Down Expand Up @@ -2919,7 +2941,7 @@ def FromXYXYtoXYYY(provided_reference_patterns):
# #convert to matrix
# molecules = dfmolecules.values
# #save as class object with type string
# molecules = molecules.astype(numpy.str)
# molecules = molecules.astype(str)
#
# '''generate list of molecular weights'''
# #select row of names
Expand All @@ -2935,7 +2957,7 @@ def FromXYXYtoXYYY(provided_reference_patterns):
# #convert to matrix
# sourceInfo = dfsourceInfo.values
# #save as class object with type string
# sourceInfo = sourceInfo.astype(numpy.str)
# sourceInfo = sourceInfo.astype(str)

'''list of massfragments monitored is not part of reference file'''
mass_fragment_numbers_monitored = None
Expand Down Expand Up @@ -3096,6 +3118,8 @@ def __init__(self, provided_reference_patterns, electronnumbers, molecules, mole
#class object variable created to allow class to be used separately from the program.
self.ExportAtEachStep = ''
self.iterationSuffix = ''
if type(referenceFileName) != type(None):
self.referenceFileNameExtension = self.referenceFileName.split(".")[1]
#This loops through the molecules, and removes whitespaces from before and after the molecule's names.
for moleculeIndex, moleculeName in enumerate(self.molecules):
self.molecules[moleculeIndex] = moleculeName.strip()
Expand Down Expand Up @@ -3136,6 +3160,24 @@ def __init__(self, provided_reference_patterns, electronnumbers, molecules, mole
self.populateIonizationEfficiencies(self.AllMID_ObjectsDict)
self.exportIonizationInfo()

#a small helper function to check if an extension exists in a filename and to return the delimiter based on that.
def getDelimiterFromExtension(self, filename=''):
    """Return the column delimiter implied by the extension of filename.

    When filename is left as the empty string, self.referenceFileName is
    used instead. The final extension decides the delimiter: tsv, tab, and
    txt map to a tab, skv maps to a semicolon, and csv maps to a comma.
    Matching is case-insensitive, and a bare extension such as "csv" (no
    leading dot) is accepted as well as a full file name. If the final
    extension is not recognized, the whole name is searched for a known
    ".ext" substring, which preserves the older substring-based behavior.
    Anything else, including a missing reference file name, gets a tab,
    which is the default delimiter for MSRESOLVE.

    filename: a file name, a path, or a bare extension; '' means use
        self.referenceFileName.
    returns: the delimiter as a one-character string.
    """
    defaultDelimiter = '\t' #for MSRESOLVE, this is now the default delimiter.
    #The order of the pairs only matters for the substring fallback below.
    delimiterByExtension = (
        ('tsv', '\t'),
        ('tab', '\t'),
        ('txt', '\t'),
        ('skv', ';'),
        ('csv', ','), #a csv could use something else, but we will assume a comma.
    )
    if filename == '':
        filename = self.referenceFileName
    if not filename: #the reference file name can be None or empty; use the default in that case.
        return defaultDelimiter
    loweredName = str(filename).lower()
    #rsplit leaves a bare extension such as "csv" unchanged, so both "ref.csv" and "csv" give "csv".
    finalExtension = loweredName.rsplit('.', 1)[-1]
    for extension, delimiter in delimiterByExtension:
        if finalExtension == extension:
            return delimiter
    #Fallback: the name does not end in a known extension, so look for one anywhere in the name.
    for extension, delimiter in delimiterByExtension:
        if '.' + extension in loweredName:
            return delimiter
    return defaultDelimiter

#This function allows adding molecules to an existing reference patterns. When using TuningCorrector it is used to create MixedReference patterns.
#Though these variable names are plural, they are expected to be lists of one. "molecules" is supposed to be a list of variable names.
#provided_reference_patterns should be in an XYYY format. If starting with XYXY data, is okay to feed a single "XY" at a time and to do so repeatedly in a loop.
Expand Down Expand Up @@ -3231,7 +3273,8 @@ def ExportFragmentationPatterns(self, verbose=True):
print(self.runTimeAtExport[savePoint])
if self.ExportAtEachStep == 'yes':
#inserting the data for a particular savePoint
filename = 'Exported%s%s.csv'%(savePoint, self.labelToExport[savePoint])
delimiter = getDelimiterFromExtension(self.referenceFileNameExtension)
filename = 'Exported%s%s.%s'%(savePoint, self.labelToExport[savePoint], self.referenceFileNameExtension)
data = self.dataToExport[savePoint]
colIndex = ['%s'% y for y in self.moleculesToExport[savePoint]]
#colIndex = ['%s'% y for y in self.molecules]
Expand Down Expand Up @@ -5569,10 +5612,10 @@ def ExportXYYYData(outputFileName, data, dataHeader, abscissaHeader = 'Mass', fi
if dataType == 'Experiment':
extraLine = len(data[0,1:])

#If future applications of Export XYYY are desired, the new formats can be
#specified by additional keywords and if statements.
#If future applications of Export XYYY are desired, the new formats can be
#specified by additional keywords and if statements.

#if iterative analysis is being used and the suffix is wanted
#if iterative analysis is being used and the suffix is wanted
if not fileSuffix =='':
#then the filename will have a suffix attached
outputFileName = outputFileName[:-4] + fileSuffix + outputFileName[-4:]
Expand Down Expand Up @@ -5609,7 +5652,9 @@ def ExportXYYYData(outputFileName, data, dataHeader, abscissaHeader = 'Mass', fi
lineToInsert = numpy.array(lineToInsert.split(','))
fullArrayToExport = numpy.vstack((lineToInsert, fullArrayToExport))
#save the file to the correct name
numpy.savetxt(filename, fullArrayToExport, delimiter = ',', fmt ="%s")

delimiter = getDelimiterFromExtension(filename)
numpy.savetxt(filename, fullArrayToExport, delimiter = delimiter, fmt ="%s")


'''This function inserts rows of percentages into arrays of data'''
Expand Down
Loading

0 comments on commit 8461fd6

Please sign in to comment.