ProteoWizard
MSDataFileTest.cpp
Go to the documentation of this file.
1//
2// $Id$
3//
4//
5// Original author: Darren Kessner <darren@proteowizard.org>
6//
7// Copyright 2007 Spielberg Family Center for Applied Proteomics
8// Cedars-Sinai Medical Center, Los Angeles, California 90048
9//
10// Licensed under the Apache License, Version 2.0 (the "License");
11// you may not use this file except in compliance with the License.
12// You may obtain a copy of the License at
13//
14// http://www.apache.org/licenses/LICENSE-2.0
15//
16// Unless required by applicable law or agreed to in writing, software
17// distributed under the License is distributed on an "AS IS" BASIS,
18// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19// See the License for the specific language governing permissions and
20// limitations under the License.
21//
22
23
24#include "MSDataFile.hpp"
25#include "Diff.hpp"
26#include "IO.hpp"
27#include "SpectrumListBase.hpp"
28#include "ChromatogramListBase.hpp"
29#include "examples.hpp"
33#include <boost/iostreams/filtering_stream.hpp>
34#include <boost/iostreams/filter/gzip.hpp>
35#include <boost/iostreams/device/file_descriptor.hpp>
36#include <boost/iostreams/copy.hpp>
37
38
39using namespace pwiz::util;
40using namespace pwiz::cv;
41using namespace pwiz::data;
42using namespace pwiz::msdata;
43
44
// Verbose output stream: null by default; main() points it at cout when the
// first command-line argument is "-v". Every test guards its output with
// `if (os_)`.
45ostream* os_ = 0;
46
47
// Common prefix for the names of all temporary files created by these tests.
48string filenameBase_ = "temp.MSDataFileTest";
49
50
52{
53 // remove metadata ptrs appended on read
54 vector<SourceFilePtr>& sfs = msd.fileDescription.sourceFilePtrs;
55 if (!sfs.empty()) sfs.erase(sfs.end()-1);
56 vector<SoftwarePtr>& sws = msd.softwarePtrs;
57 if (!sws.empty()) sws.erase(sws.end()-1);
58
59 // remove current DataProcessing created on read
60 SpectrumListBase* sl = dynamic_cast<SpectrumListBase*>(msd.run.spectrumListPtr.get());
64}
65
67{
68 string filename1 = filenameBase_ + ".mgf";
69 string filename2 = filenameBase_ + ".mzXML";
70
71 ofstream ofs(filename1.c_str());
72 string mgf = "CHARGE=2+ and 3+\nBEGIN IONS\nPEPMASS=952.924194 145032.0000\nCHARGE=2+\nRTINSECONDS=301.48\n271.0874 2\n298.1747 4\nEND IONS\nBEGIN IONS\nPEPMASS=503.800000 67522.2000\nCHARGE=2+\nRTINSECONDS=302.51\n147.1840 3\n154.3668 3\n162.2118 2\n162.9007 1\n167.3297 1\n175.2387 2\n184.9460 3\nEND IONS\n";
73 ofs.write(mgf.c_str(), mgf.length());
74 ofs.close();
75
76 // make sure that round trip doesn't systematically increase converted scan numbers
77 for (int loop = 3; loop--; )
78 {
79 MSDataFile msd1(filename1); // read back the MGF
80 const SpectrumList& sl = *msd1.run.spectrumListPtr;
81 SpectrumPtr spectrum = sl.spectrum(0);
82 unit_assert(spectrum->id == "index=0");
83 MSDataFile::WriteConfig writeConfig;
84 writeConfig.format = MSDataFile::Format_mzXML;
85 MSDataFile::write(msd1, filename2, writeConfig); // write as mzXML
86 MSDataFile msd2(filename2); // read back the mzXML
87 const SpectrumList& sl2= *msd2.run.spectrumListPtr;
88 SpectrumPtr spectrum2 = sl2.spectrum(0);
89 unit_assert(spectrum2->id == "index=1"); // mzXML is 1-based
90 MSDataFile::WriteConfig writeConfig2;
91 writeConfig2.format = MSDataFile::Format_MGF;
92 MSDataFile::write(msd2, filename1, writeConfig2); // write as mgf
93 }
94
95 // remove temp files
96 boost::filesystem::remove(filename1);
97 boost::filesystem::remove(filename2);
98}
99
100
102 const DiffConfig diffConfig)
103{
104 if (os_) *os_ << "validateWriteRead()\n " << writeConfig << endl;
105
106 string filename1 = filenameBase_ + ".1";
107 string filename2 = filenameBase_ + ".2";
108 string filename3 = filenameBase_ + ".3";
109 string filename4 = filenameBase_ + ".\xE4\xB8\x80\xE4\xB8\xAA\xE8\xAF\x95.4";
110 // FIXME: 4-byte UTF-8 not working: string filename5 = filenameBase_ + ".\x01\x04\xA4\x01\x04\xA2.5";
111
112 {
113 // create MSData object in memory
114 MSData tiny;
116
117 if (writeConfig.format == MSDataFile::Format_mzXML)
118 {
119 // remove s22 since it is not written to mzXML
120 static_cast<SpectrumListSimple&>(*tiny.run.spectrumListPtr).spectra.pop_back();
121 }
122
123 // write to file #1 (static)
124 MSDataFile::write(tiny, filename1, writeConfig);
125
126 // simulate CLI garbage collect behavior, wherein delayed deletes stress
127 // memory and file handle usage
128 {
129 std::vector< boost::shared_ptr< MSDataFile > > msds;
130 for (int i=0;i<100;i++)
131 {
132 boost::shared_ptr<MSDataFile> msd1(new MSDataFile(filename1));
133 msds.push_back(msd1);
134 hackInMemoryMSData(*msd1);
135 Diff<MSData, DiffConfig> diff(tiny, *msd1, diffConfig);
136 }
137 }
138
139 // read back into an MSDataFile object
140 MSDataFile msd1(filename1);
141 hackInMemoryMSData(msd1);
142
143 // compare
144 Diff<MSData, DiffConfig> diff(tiny, msd1, diffConfig);
145 if (diff && os_) *os_ << diff << endl;
147
148 // write to file #2 (member)
149 msd1.write(filename2, writeConfig);
150
151 // read back into another MSDataFile object
152 MSDataFile msd2(filename2);
153 hackInMemoryMSData(msd2);
154
155 // compare
156 diff(tiny, msd2);
157 if (diff && os_) *os_ << diff << endl;
159
160 // now give the gzip read a workout
161 bio::filtering_istream tinyGZ(bio::gzip_compressor() | bio::file_descriptor_source(filename1));
162 bio::copy(tinyGZ, bio::file_descriptor_sink(filename1+".gz", ios::out|ios::binary));
163
164 MSDataFile msd3(filename1+".gz");
165 hackInMemoryMSData(msd3);
166
167 // compare
168 diff(tiny, msd3);
169 if (diff && os_) *os_ << diff << endl;
171
172 // test writing to a stream
173 ostringstream oss;
174 msd1.write(oss, writeConfig);
175 string ossStr = oss.str();
176 ofstream ofs(filename3.c_str());
177 ofs.write(ossStr.c_str(), ossStr.length());
178 ofs.close();
179
180 // read back into another MSDataFile object
181 MSDataFile msd4(filename3);
182 hackInMemoryMSData(msd4);
183
184 // compare
185 diff(tiny, msd4);
186 if (diff && os_) *os_ << diff << endl;
188
189
190 // write to file #4 (testing two byte UTF-8 code points)
191 msd1.write(filename4, writeConfig);
192
193 // read back into another MSDataFile object
194 MSDataFile msd5(filename4);
195 hackInMemoryMSData(msd5);
196
197 // compare
198 diff(tiny, msd5);
199 if (diff && os_) *os_ << diff << endl;
201
202
203 // write to file #5 (testing four byte UTF-8 code points)
204 /*msd1.write(filename5, writeConfig);
205
206 // read back into another MSDataFile object
207 MSDataFile msd6(filename5);
208 hackInMemoryMSData(msd6);
209
210 // compare
211 diff(tiny, msd6);
212 if (diff && os_) *os_ << diff << endl;
213 unit_assert(!diff);*/
214 }
215
216 // remove temp files
217 boost::filesystem::remove(filename1);
218 boost::filesystem::remove(filename2);
219 boost::filesystem::remove(filename1 + ".gz");
220 boost::filesystem::remove(filename3);
221 boost::filesystem::remove(filename4);
222 //boost::filesystem::remove(filename5);
223}
224
225void test()
226{
227 MSDataFile::WriteConfig writeConfig;
228 DiffConfig diffConfig;
229
231
232 // mzML 64-bit, full diff
233 validateWriteRead(writeConfig, diffConfig);
234
235 writeConfig.indexed = false;
236 validateWriteRead(writeConfig, diffConfig); // no index
237 writeConfig.indexed = true;
238
239 // mzML 32-bit, full diff
241 validateWriteRead(writeConfig, diffConfig);
242
243 // mzXML 32-bit, diff ignoring metadata and chromatograms
244 writeConfig.format = MSDataFile::Format_mzXML;
245 diffConfig.ignoreMetadata = true;
246 diffConfig.ignoreChromatograms = true;
247 validateWriteRead(writeConfig, diffConfig);
248
249 // mzXML 64-bit, diff ignoring metadata and chromatograms
251 validateWriteRead(writeConfig, diffConfig);
252
253 writeConfig.indexed = false;
254 validateWriteRead(writeConfig, diffConfig); // no index
255 writeConfig.indexed = true;
256}
257
258
259void demo()
260{
261 MSData tiny;
263
265 MSDataFile::write(tiny, filenameBase_ + ".64.mzML", config);
266
268 MSDataFile::write(tiny, filenameBase_ + ".32.mzML", config);
269
270 config.format = MSDataFile::Format_Text;
271 MSDataFile::write(tiny, filenameBase_ + ".txt", config);
272
273 config.format = MSDataFile::Format_mzXML;
274 MSDataFile::write(tiny, filenameBase_ + ".32.mzXML", config);
275
277 MSDataFile::write(tiny, filenameBase_ + ".64.mzXML", config);
278}
279
280
// 18-byte signature that testReader() writes to its fake ".RAW" file and that
// TestReader::identify() matches against the head of the file: bytes 0x01 0xA1,
// then the characters of "Finnigan", each followed by a zero byte.
281const char rawHeader_[] = {'\x01', '\xA1',
282 'F', '\0', 'i', '\0', 'n', '\0', 'n', '\0',
283 'i', '\0', 'g', '\0', 'a', '\0', 'n', '\0'};
284
285
286class TestReader : public Reader
287{
288 public:
289
291
292 virtual std::string identify(const std::string& filename, const std::string& head) const
293 {
294 if (filename.size()<=4 || filename.substr(filename.size()-4)!=".RAW")
295 return std::string("");
296
297 for (size_t i=0; i<sizeof(rawHeader_); i++)
298 if (head[i] != rawHeader_[i])
299 return std::string("");
300
301 count++;
302 return filename;
303 }
304
305 virtual void read(const std::string& filename, const std::string& head, MSData& result, int runIndex = 0,
306 const Config& config = Config()) const
307 {
308 count++;
309 }
310
311 virtual void read(const std::string& filename,
312 const std::string& head,
313 std::vector<MSDataPtr>& results,
314 const Config& config = Config()) const
315 {
316 results.push_back(MSDataPtr(new MSData));
317 read(filename, head, *results.back(), 0, config);
318 }
319
320 const char *getType() const {return "testReader";} // satisfy inheritance
321
322 mutable int count;
323};
324
325
327{
328 // create a file
329 string filename = filenameBase_ + ".RAW";
330 ofstream os(filename.c_str());
331 os.write(rawHeader_, 18);
332 os.close();
333
334 // open the file with our Reader
335 TestReader reader;
336 MSDataFile msd(filename, &reader);
337
338 // verify that our reader got called properly
339 unit_assert(reader.count == 2);
340
341 // remove temp file
342 boost::filesystem::remove(filename);
343
344 if (os_) *os_ << endl;
345}
346
347
349{
350 if (os_) *os_ << "testSHA1()\n";
351
352 // write out a test file
353
354 string filename = filenameBase_ + ".SHA1Test";
355 MSData tiny;
357 MSDataFile::write(tiny, filename);
358
359 {
360 // read in without SHA-1 calculation
361 MSDataFile msd(filename);
362
363 if (os_)
364 {
365 *os_ << "no SHA-1:\n";
367 IO::write(writer, *msd.fileDescription.sourceFilePtrs.back());
368 }
369
371 unit_assert(!msd.fileDescription.sourceFilePtrs.back()->hasCVParam(MS_SHA_1));
372
373 // read in with SHA-1 calculation
374
375 MSDataFile msd_sha1(filename, 0, true);
376
377 if (os_)
378 {
379 *os_ << "with SHA-1:\n";
381 IO::write(writer, *msd_sha1.fileDescription.sourceFilePtrs.back());
382 }
383
384 unit_assert(!msd_sha1.fileDescription.sourceFilePtrs.empty());
385 unit_assert(msd_sha1.fileDescription.sourceFilePtrs.back()->hasCVParam(MS_SHA_1));
386 }
387
388 // clean up
389
390 boost::filesystem::remove(filename);
391 if (os_) *os_ << endl;
392}
393
394
395int main(int argc, char* argv[])
396{
397 TEST_PROLOG(argc, argv)
398
399 try
400 {
401 if (argc>1 && !strcmp(argv[1],"-v")) os_ = &cout;
402 test();
403 //demo();
404 testReader();
405 testSHA1();
406 }
407 catch (exception& e)
408 {
409 TEST_FAILED(e.what())
410 }
411 catch (...)
412 {
413 TEST_FAILED("Caught unknown exception.")
414 }
415
417}
418
void diff(const string &filename1, const string &filename2)
string filename1
int main(int argc, char *argv[])
void testSHA1()
void validateWriteRead(const MSDataFile::WriteConfig &writeConfig, const DiffConfig diffConfig)
void hackInMemoryMSData(MSData &msd)
void validateMmgfMzxmlRoundTrip()
const char rawHeader_[]
void demo()
void testReader()
ostream * os_
void test()
string filenameBase_
virtual std::string identify(const std::string &filename, const std::string &head) const
const char * getType() const
virtual void read(const std::string &filename, const std::string &head, std::vector< MSDataPtr > &results, const Config &config=Config()) const
virtual void read(const std::string &filename, const std::string &head, MSData &result, int runIndex=0, const Config &config=Config()) const
The XMLWriter class provides simple, tag-level XML syntax writing.
Definition XMLWriter.hpp:48
common functionality for base ChromatogramList implementations
virtual void setDataProcessingPtr(DataProcessingPtr dp)
set DataProcessing
common functionality for base SpectrumList implementations
virtual void setDataProcessingPtr(DataProcessingPtr dp)
set DataProcessing
Interface for accessing spectra, which may be stored in memory or backed by a data file (RAW,...
Definition MSData.hpp:661
virtual SpectrumPtr spectrum(size_t index, bool getBinaryData=false) const =0
retrieve a spectrum by index
interface for file readers
Definition Reader.hpp:37
MS_SHA_1
SHA-1: SHA-1 (Secure Hash Algorithm-1) is a cryptographic hash function designed by the National Secu...
Definition cv.hpp:2316
PWIZ_API_DECL void write(minimxml::XMLWriter &writer, const CV &cv)
PWIZ_API_DECL void initializeTiny(MSData &msd)
boost::shared_ptr< DataProcessing > DataProcessingPtr
Definition MSData.hpp:288
boost::shared_ptr< MSData > MSDataPtr
Definition MSData.hpp:913
boost::shared_ptr< Spectrum > SpectrumPtr
Definition MSData.hpp:573
Calculate diffs of objects in a ProteoWizard data model hierarchy.
Definition diff_std.hpp:143
configuration struct for diffing MSData types
Definition Diff.hpp:206
bool ignoreMetadata
ignore all file level metadata, and most scan level metadata, i.e.
Definition Diff.hpp:214
std::vector< SourceFilePtr > sourceFilePtrs
list and descriptions of the source files this mzML document was generated or derived from.
Definition MSData.hpp:90
configuration for write()
BinaryDataEncoder::Config binaryDataEncoderConfig
MSData object plus file I/O.
static void write(const MSData &msd, const std::string &filename, const WriteConfig &config=WriteConfig(), const pwiz::util::IterationListenerRegistry *iterationListenerRegistry=0)
static write function for any MSData object; iterationListenerRegistry may be used for progress updat...
This is the root element of ProteoWizard; it represents the mzML element, defined as: intended to cap...
Definition MSData.hpp:850
Run run
a run in mzML should correspond to a single, consecutive and coherent set of scans on an instrument.
Definition MSData.hpp:886
std::vector< SoftwarePtr > softwarePtrs
list and descriptions of software used to acquire and/or process the data in this mzML file.
Definition MSData.hpp:871
FileDescription fileDescription
information pertaining to the entire mzML file (i.e. not specific to any part of the data set) is sto...
Definition MSData.hpp:862
ChromatogramListPtr chromatogramListPtr
all chromatograms for this run.
Definition MSData.hpp:830
SpectrumListPtr spectrumListPtr
all mass spectra and the acquisitions underlying them are described and attached here....
Definition MSData.hpp:827
Simple writeable in-memory implementation of SpectrumList.
Definition MSData.hpp:717
#define unit_assert(x)
Definition unit.hpp:85
#define TEST_EPILOG
Definition unit.hpp:183
#define TEST_FAILED(x)
Definition unit.hpp:177
#define TEST_PROLOG(argc, argv)
Definition unit.hpp:175