| // +----------------------------------------------------------------------+ // // $Id$ // /** * A self-contained class to parse a PDB file into an array of residues * each containing an array of atoms *
* Useful when dealing with big PDB files, where using the Science_Chemistry_PDBFile * class will generate out of memory errors. * * @author Jesus M. Castagnetto * @version 1.0 * @access public * @package Science_Chemistry * @see Science_Chemistry_PDBFile */ class Science_Chemistry_PDBParser { /** * PDB ID * * @var string * @access private */ var $pdb; /** * Full path to PDB file * * @var string * @access private */ var $file; /** * PDB file's date * * @var date * @access private */ var $date; /** * PDB macromolecule(s) class * * @var string * @access private */ var $class; /** * Array of macromolecules * * @var array * @access private */ var $macromolecules; /** * Number of macromolecules * * @var int * @access private */ var $num_macromolecules; /** * Constructor for the class, requires a PDB filename * * @param string $filename PDB filename * @param boolean $multi whether to parse all models in a multi-model file * @param boolean $meta whether to store the PDB file meta information * @param boolean $full whether to store the full set of fields per atom * @return object PDBParser * @access public * @see parseResidues() */ function Science_Chemistry_PDBParser($filename, $multi=false, $meta=false, $full=false) { if (!file_exists($filename)) return null; list($pdb,) = explode(".",basename($filename)); $this->pdb = $pdb; $this->file = realpath($filename); $arr = file($filename); // parsing the PDB file $month = array ( "JAN" => "01", "FEB" => "02", "MAR" => "03", "APR" => "04", "MAY" => "05", "JUN" => "06", "JUL" => "07", "AUG" => "08", "SEP" => "09", "OCT" => "10", "NOV" => "11", "DEC" => "12" ); $header_re = "/^HEADER[[:space:]]+(([^[:space:]]+ )+)[[:space:]]+"; $header_re .= "([0-9]{2}-[A-Z]{3}-[0-9]{2,4})[[:space:]]+[A-Z0-9]{4}/"; if (preg_match($header_re, $arr[0], $regs)) { $this->class = trim($regs[1]); // put date in a more standard format $tmp = explode("-", $regs[3]); if ($tmp[2] <= 23) $year = 2000 + (int)$tmp[2]; else $year = 1900 + (int)$tmp[2]; $this->date = $year."-".$month[$tmp[1]]."-".$tmp[0]; } $flag = "nomodel"; $tmparr = array(); for ($i=0; $i < count($arr); $i++) { if (!trim($arr[$i])) continue; $rectype = strtok($arr[$i]," "); // check if we have multi-model file if ($rectype == "MODEL") continue; // did we get a multi-model file and are parsing the end // of a model, if so, end parsing altogether if ($rectype == "ENDMDL") { if ($multi) { $this->macromolecules[] = $this->parseResidues($tmparr, $full); $this->num_macromolecules++; // = count($this->macromolecules); $tmparr = array(); } else { break; } continue; } // accumulate atom records, put the rest into the meta array if ($rectype == "ATOM" || $rectype == "HETATM") $tmparr[] = $arr[$i]; elseif ($meta) $this->meta[$rectype][] = $arr[$i]; } if (!empty($tmparr)) { $this->macromolecules[] = $this->parseResidues($tmparr, $full); $this->num_macromolecules++; // = count($this->macromolecules); } } /** * Makes the array of residues in the macromolecule * * @param array $records * @param boolean $full whether to store the full set of fields per atom * @see parseFile() * @see parseAtom() */ function parseResidues($records, $full) { $curr_res_id = ""; $residues = array(); $res_atoms = array(); for ($i=0; $i< count($records); $i++) { $atomrec =& $records[$i]; $res_name = trim(substr($atomrec,17,3)); $chain = trim(substr($atomrec,21,1)); $seq_num = (int) trim(substr($atomrec,22,4)); $res_id = $res_name.":".$seq_num.":".$chain; //if ($i == 0) // $curr_res_id = $res_id; if ($res_id == $curr_res_id) { $res_atoms[] = $atomrec; if ($i != (count($records) - 1)) continue; } if (($res_id != $curr_res_id) || ($i == (count($records) - 1)) ) { if (!empty($res_atoms)) { for ($j=0; $j < count($res_atoms); $j++) { $temp = $this->parseAtom($res_atoms[$j], $full, $atomname); $residues[$curr_res_id][$atomname] = $temp; } } $curr_res_id = $res_id; $res_atoms = array($atomrec); } } return $residues; } /** * Parses an atom record into an associative array * * @param string $atomrec PDB atom record * @param boolean $full whether to store the full set of fields per atom * @see parseResidues() */ function parseAtom($atomrec, $full, $atomname) { $atom = array(); // process PDB atom record // no error checking, assumes correct and standard record $atom["RecName"] = trim(substr($atomrec,0,6)); $atom["SerNum"] = (int) trim(substr($atomrec,6,5)); $atom["AtomName"] = trim(substr($atomrec,12,4)); $atomname = $atom["AtomName"]; if ($full) { $atom["AltLoc"] = trim(substr($atomrec,16,1)); $atom["ResName"] = trim(substr($atomrec,17,3)); $atom["ChainID"] = trim(substr($atomrec,21,1)); $atom["ResSeqNum"] = (int) trim(substr($atomrec,22,4)); $atom["InsCode"] = trim(substr($atomrec,26,1)); $atom["Occupancy"] = (float) trim(substr($atomrec,54,6)); $atom["TempFactor"] = (float) trim(substr($atomrec,60,6)); $atom["SegmentID"] = trim(substr($atomrec,72,4)); $atom["Charge"] = (float)trim(substr($atomrec,78,2)); $atom["Element"] = trim(substr($atomrec,76,2)); } $atom["X"] = (double) trim(substr($atomrec,30,8)); $atom["Y"] = (double) trim(substr($atomrec,38,8)); $atom["Z"] = (double) trim(substr($atomrec,46,8)); return $atom; } /** * Returns an array of residues with a particular name * from the indicated macromolecule index * * @param integer $macromol Index of the macromolecule in the $macromolecules array * @param string $resnam Residue name, e.g. HIS, CYS, etc. * @return array list of residues with the requested name * @access public * @see $macromolecules */ function getResidueList ($macromol, $resname) { $mol =& $this->macromolecules[$macromol]; $reslist = array(); if (!$mol) return $reslist; foreach($mol as $resid=>$atoms) { list($curr_res_name,,) = explode(":",$resid); // echo $curr_res_name."***\n"; if ($curr_res_name == $resname) $reslist[$resid] = $atoms; else continue; } return $reslist; } } // end of PDBParser // vim: expandtab: ts=4: sw=4 ?>