diff --git a/examples/flex-EFP/Scripts/cut_qm.py b/examples/flex-EFP/Scripts/cut_qm.py index 6c7ccb5..427390f 100644 --- a/examples/flex-EFP/Scripts/cut_qm.py +++ b/examples/flex-EFP/Scripts/cut_qm.py @@ -7,11 +7,11 @@ Sample execution: cut_qm.py ala_33_473.inp a0001.efp -This file reads in an input file and a reference efp file that should be a good geometric match (low RMSD), -AND the .inp file has listed atoms that should be deleted, or polarizable points that should be deleted. -Atoms that will be added to the QM region should be removed from the fragment file. Additionally, a bridge -between a QM-atom and an MM-atom that are covalently bound should have the QM atom removed, and some -information of the MM-atom should be removed. The coordinates, monopole, and screen parameter sections +This file reads in an input file and a reference EFP file that should be a good geometric match (low RMSD), +AND the .inp file has listed atoms and polarizable points that should be deleted. +Atoms that will be added to the QM region should be removed from the fragment file. +Additionally, a bridge between a QM-atom and an MM-atom that are covalently bound should have the QM atom removed, +and some information of the MM-atom should be removed. The coordinates, monopole, and screen parameter sections for the MM-atom are retained, while the dipoles, quadrupoles, octupoles, and polarizable point sections should be removed. """ @@ -19,51 +19,117 @@ import sys import numpy as np -def distance(x1,y1,z1,x2,y2,z2): - dist=np.sqrt((float(x1)-float(x2))**2+(float(y1)-float(y2))**2+(float(z1)-float(z2))**2) +def distance(x1, y1, z1, x2, y2, z2): + dist = np.sqrt((float(x1) - float(x2))**2 + + (float(y1) - float(y2))**2 + + (float(z1) - float(z2))**2) return dist -def get_specials(lines): # Return the atom indexes of atoms to be removed entirely and atoms to remove polarizations - atoms=[] - pols=[] +def get_specials(lines): + """ + Grab input file lines to obtain the atom IDs that should be removed entirely + and the atom indexes for which only the polarizable points should be removed. + + The function looks for keywords 'erased:' and 'remove:' in the lines. + -make_AAs_V2.py added these comments. + + Reads: + lines: input file (.inp) lines. + Returns: + atomid (list): Atom indexes for complete removal. + polid (list): Atom indexes for polarizable points removal. + """ + atoms = [] # List for names of atoms to be removed entirely. + pols = [] # List for names of atoms whose polarizable points are to be removed. + + # First, extract the atom names from lines that contain 'erased:' and 'remove:' for line in lines: - j=-1 + j = -1 if 'erased:' in line: - while(line.split()[j]!='erased:'): + # Walk backwards from the end of the line until reaching 'erased:' + while line.split()[j] != 'erased:': atoms.append(line.split()[j]) - j-=1 + j -= 1 elif 'remove:' in line: - while(line.split()[j]!='remove:'): + # Walk backwards from the end of the line until reaching 'remove:' + while line.split()[j] != 'remove:': pols.append(line.split()[j]) - j-=1 - i=0 - atomid=[] - polid=[] - start=0 + j -= 1 + + # Scan through the file lines to grab the positions of the pesky atoms/polarizabilities. + i = 0 + atomid = [] # List to record the atom index numbers for full removal. + polid = [] # List to record the atom index numbers for polarizable point removal. + start = 0 # Flag to indicate when the coordinates section starts. for line in lines: if '$end' in line: - start=0 - elif(start==1): - i+=1 + start = 0 + elif start: + i += 1 # Increment a counter for each coordinate line + # Check the atom name (first token in the line) if line.split()[0] in atoms: atomid.append(i) elif line.split()[0] in pols: polid.append(i) + # Additionally, if the atom label contains 'H000', mark it for removal + # -All virual hydrogens are to be removed. elif 'H000' in line: atomid.append(i) - elif('C1'==line.split()[0]): - start=1 - return atomid,polid + # The coordinates section is assumed to start when a line: 'C1' + elif 'C1' == line.split()[0]: + start = 1 + return atomid, polid -def get_coords(lines,atoms,pols): - start=0 - coords=[] - rem_coords=[] +def get_coords(lines, atoms, pols): + """ + Grab the coordinate lines while separating those that should be removed (rem_coords) + from those to be kept (coords). + + Reads: + lines: Lines from the EFP file. + atoms: Atom indexes to remove entire atom. + pols: Atom indexes for which only polarizable points should be removed. + + Returns: + coords, rem_coords; where: + - coords: List of coordinate lines to keep. + - rem_coords: List of coordinate lines that are flagged for removal. + """ + start = False + coords = [] + rem_coords = [] + + # Helper function to extract two atom numbers from bond midpoints, bond identifier starts with 'B' + def extract_atomnums_B(line): + """ + Given a line starting with 'B', extract two atom numbers based on the length + of the first token. + """ + token = line.split()[0] + if len(token) == 4: + return int(token[2]), int(token[3]) + elif len(token) == 5: + return int(token[2:4]), int(token[4]) + elif len(token) == 6: + return int(token[2:4]), int(token[4:6]) + else: + return None, None + + # Process each line in the file. for line in lines: + # Exit when "STOP" is encountered. if 'STOP' in line: - return coords,rem_coords - elif(start==1) and (line[0]=='A'): - atomnum=int(line[1:3]) + return coords, rem_coords + # Begin processing with the "COORDINATES" section. + if 'COORDINATES' in line: + start = True + continue + if not start: + continue + + # If the line starts with 'A', extract the two-digit atom number. + if line.startswith('A'): + atomnum = int(line[1:3]) if atomnum in atoms: rem_coords.append(line) elif atomnum in pols: @@ -71,238 +137,313 @@ def get_coords(lines,atoms,pols): rem_coords.append(line) else: coords.append(line) - elif(start==1) and (line[0]=='B'): - if(len(line.split()[0])==4): - atomnum=int(line[2]) - atomnum2=int(line[3]) - if atomnum in atoms: - rem_coords.append(line) - elif atomnum2 in atoms: - rem_coords.append(line) - elif atomnum in pols: - rem_coords.append(line) - coords.append(line) - elif atomnum2 in pols: - rem_coords.append(line) - coords.append(line) - else: - coords.append(line) - elif(len(line.split()[0])==5): - atomnum=int(line[2:4]) - atomnum2=int(line[4]) - if atomnum in atoms: - rem_coords.append(line) - elif atomnum2 in atoms: - rem_coords.append(line) - elif atomnum in pols: - rem_coords.append(line) - coords.append(line) - elif atomnum2 in pols: - rem_coords.append(line) - coords.append(line) - else: - coords.append(line) - elif(len(line.split()[0])==6): - atomnum=int(line[2:4]) - atomnum2=int(line[4:6]) - if atomnum in atoms: - rem_coords.append(line) - elif atomnum2 in atoms: - rem_coords.append(line) - elif atomnum in pols: - rem_coords.append(line) - coords.append(line) - elif atomnum2 in pols: - rem_coords.append(line) - coords.append(line) - else: - coords.append(line) - elif 'COORDINATES' in line: - start=1 -def get_monopoles(lines,coords): - monopoles=[] - keep_names=[] - start=0 - for atom in coords: - keep_names.append(atom.split()[0]) + # For lines starting with 'B', use the helper to extract atom numbers. + elif line.startswith('B'): + atomnum, atomnum2 = extract_atomnums_B(line) + if atomnum is None: + continue # Skip if extraction failed + # If either atom number is in the removal list, add to rem_coords. + if atomnum in atoms or atomnum2 in atoms: + rem_coords.append(line) + # If either atom number is in the polarizable removal list, add to both lists. + elif atomnum in pols or atomnum2 in pols: + coords.append(line) + rem_coords.append(line) + else: + coords.append(line) + + return coords, rem_coords + +def get_monopoles(lines, coords): + """ + Extract the monopole section for the atoms given coordinates. + + Reads: + lines (list of str): Lines from the EFP file. + coords (list of str): List of coordinate lines to keep. + + Returns: + monopoles (list of str): Lines from the MONOPOLES section corresponding to the kept atoms. + """ + monopoles = [] + keep_names = [atom.split()[0] for atom in coords] + start = 0 for line in lines: - if(start==1): + if start == 1: if 'STOP' in line: return monopoles if line.split()[0] in keep_names: monopoles.append(line) if 'MONOPOLES' in line: - start=1 - -def get_dipoles(lines,coords): - dipoles=[] - cut_names=[] - start=0 - for atom in coords: - cut_names.append(atom.split()[0]) + start = 1 + +def get_dipoles(lines, cut_coords): + """ + Extract the dipoles section lines that should be removed. + + Reads: + lines (list of str): Lines from the EFP file. + cut_coords (list of str): Coordinate lines of atoms to remove. + + Returns: + dipoles (list of str): Lines from the DIPOLES section not matching the kept atoms. + """ + dipoles = [] + cut_names = [atom.split()[0] for atom in cut_coords] + start = 0 for line in lines: - if(start==1): + if start == 1: if 'STOP' in line: return dipoles + # Skip lines if the atom name is in the list of coordinates to cut. if line.split()[0] in cut_names: continue else: dipoles.append(line) if 'DIPOLES' in line: - start=1 + start = 1 -def get_quadrupoles(lines,coords): - quadrupoles=[] - cut_names=[] - start=0 - k=0 - j=0 - for atom in coords: - cut_names.append(atom.split()[0]) +def get_quadrupoles(lines, cut_coords): + """ + Extract the quadrupoles section lines. + + Uses counters (k and j) to control how many lines to skip or include + when the atom name is encountered. Quardupoles are multi-line entries. + + Reads: + lines (list of str): Lines from the EFP file. + cut_coords (list of str): Coordinate lines of atoms to remove. + + Returns: + quadrupoles (list of str): Lines from the QUADRUPOLES section. + """ + quadrupoles = [] + cut_names = [atom.split()[0] for atom in cut_coords] + start = 0 + k = 0 # Counter to skip one line after a match + j = 0 # Counter to include one line after a non-match for line in lines: - if(start==1): + if start == 1: if 'STOP' in line: return quadrupoles - elif(k==1): - k-=1 - elif(j==1): + elif k == 1: + k -= 1 + elif j == 1: quadrupoles.append(line) - j-=1 + j -= 1 + # When encountering a cut name, set k to skip the next line; do not append. elif line.split()[0] in cut_names: - k=1 + k = 1 else: quadrupoles.append(line) - j=1 + j = 1 if 'QUADRUPOLES' in line: - start=1 + start = 1 -def get_octupoles(lines,coords): - octupoles=[] - cut_names=[] - start=0 - j=0 - k=0 - for atom in coords: - cut_names.append(atom.split()[0]) +def get_octupoles(lines, cut_coords): + """ + Extract the octupoles section lines. + + Uses counters (j and k) similar to quadrupoles to control skipping/inclusion. + -octupoles are also multi-lin entries. + + Reads: + lines (list of str): Lines from the EFP file. + cut_coords (list of str): Coordinate lines of atoms to remove. + + Returns: + octupoles (list of str): Lines from the OCTUPOLES section. + """ + octupoles = [] + cut_names = [atom.split()[0] for atom in cut_coords] + start = 0 + j = 0 + k = 0 for line in lines: - if(start==1): + if start == 1: if 'STOP' in line: return octupoles - elif(k>0): - k-=1 - elif(j>0): + elif k > 0: + k -= 1 + elif j > 0: octupoles.append(line) - j-=1 + j -= 1 elif line.split()[0] in cut_names: - k=2 + k = 2 else: octupoles.append(line) - j=2 + j = 2 if 'OCTUPOLES' in line: - start=1 + start = 1 -def get_polarpts(lines,cut_coords): - polars=[] - remove_names=[] - start=0 - j=0 - for atom in cut_coords: - remove_names.append(atom.split()[0]) +def get_polarpts(lines, cut_coords): + """ + Extract the polarizable points section. + + For each polarizable point, compute the distance to each atom in the cut_coords. + If the minimum distance is greater than 3, the point is retained. + -Polarizable points do not have "atomnames" the same as the multipole parameters. + + Reads: + lines (list of str): Lines from the EFP file. + cut_coords (list of str): Coordinates of atoms to remove. + + Returns: + polars (list of str): Lines from the POLARIZABLE POINTS section that pass the filter. + """ + polars = [] + #remove_names = [atom.split()[0] for atom in cut_coords] + start = 0 + j = 0 for line in lines: - mindist=20.0 - if(start==1): + mindist = 20.0 # Initialize minimum distance with a high value. + if start == 1: if 'STOP' in line: return polars - elif(j>0): + elif j > 0: polars.append(line) - j-=1 - elif(line[0:2]=='CT'): + j -= 1 + # Check lines starting with "CT" (assumed polarizable point lines) + elif line[0:2] == 'CT': for atom in cut_coords: - current_dist=distance(line.split()[1],line.split()[2],line.split()[3],atom.split()[1],atom.split()[2],atom.split()[3]) - if(current_dist3): + # Calculate the distance between the polarizable point and an atom in cut_coords. + current_dist = distance(line.split()[1], line.split()[2], line.split()[3], + atom.split()[1], atom.split()[2], atom.split()[3]) + if current_dist < mindist: + mindist = current_dist + # If the minimum distance is greater than 3, keep this polarizable point. + if mindist > 3: polars.append(line) - j=3 + j = 3 # Use a counter to skip the next 3 lines. if 'POLARIZABLE POINTS' in line: - start=1 + start = 1 -def get_screen(lines,coords,title): - params=[] - keep_names=[] - start=0 - for atom in coords: - keep_names.append(atom.split()[0]) +def get_screen(lines, coords, title): + """ + Extract screening parameter lines from the file. + + Only lines with atom names in coords are kept. + + Reads: + lines (list of str): Lines from the EFP file. + coords (list of str): Coordinates used to filter the screening parameters. + title (str): A string that identifies the screen section (e.g., 'SCREEN ' or 'SCREEN2'). + + Returns: + params (list of str): Filtered screening parameter lines. + """ + params = [] + keep_names = [atom.split()[0] for atom in coords] + start = 0 for line in lines: - if(start==1): + if start == 1: if 'STOP' in line: return params if line.split()[0] in keep_names: params.append(line) if title in line: - start=1 + start = 1 -def get_header(lines,input_name): - header=[] - fragname=input_name.split('.')[0] +def get_header(lines, input_name): + """ + Construct the header for the output file by replacing the placeholder '$FRAGNAME' + with the fragment name (derived from the input file name); otherwise retain the same + heading lines as-is from the .efp file. + + Reads: + lines (list of str): Lines from the EFP file. + input_name (str): The input file name. + + Returns: + header (list of str): The modified header lines. + """ + header = [] + fragname = input_name.split('.')[0] for line in lines: - header.append(line.replace('$FRAGNAME',fragname)) - #print(line) + header.append(line.replace('$FRAGNAME', fragname)) + # Stop adding header lines once the COORDINATES section is encountered. if 'COORDINATES (BOHR)' in line: return header def main(inp, efp): - with open(inp,'r') as inp_: - inp_lines=inp_.readlines() - with open(efp,'r') as efp_: - efp_lines=efp_.readlines() - header=get_header(efp_lines,inp) + """ + Main processing function. + + 1. Read the input (.inp) file and the reference EFP file. + 2. Build the header by replacing '$FRAGNAME' with the fragment name. + 3. Parse the input file to determine which atoms/polarizable points are to be removed. + 4. Extract coordinate lines (keeping and removal sets). + 5. Extract monopoles, dipoles, quadrupoles, octupoles, and polarizable points. + 6. Extract screening parameter sections. + 7. Write the output file 'cut_' containing the header and the filtered sections. + """ + # Read input files + with open(inp, 'r') as inp_: + inp_lines = inp_.readlines() + with open(efp, 'r') as efp_: + efp_lines = efp_.readlines() + + # Construct header by replacing '$FRAGNAME' with the actual fragment name. + header = get_header(efp_lines, inp) + + # Get atom indexes for complete removal and for polarizable point removal. rem_atoms, rem_pols = get_specials(inp_lines) - keep_coords,rem_coords=get_coords(efp_lines,rem_atoms,rem_pols) - keep_monop=get_monopoles(efp_lines,keep_coords) - keep_dip=get_dipoles(efp_lines,rem_coords) - keep_quadrup=get_quadrupoles(efp_lines,rem_coords) - keep_octup=get_octupoles(efp_lines,rem_coords) - keep_pols=get_polarpts(efp_lines,rem_coords) - keep_screen=get_screen(efp_lines,keep_coords,'SCREEN ') - keep_screen2=get_screen(efp_lines,keep_coords,'SCREEN2') - with open('cut_'+efp,'w') as outfile: + + # Extract coordinate lines: + # keep_coords: coordinate lines to be kept, + # rem_coords: coordinate lines flagged for removal. + keep_coords, rem_coords = get_coords(efp_lines, rem_atoms, rem_pols) + + # Extract sections for monopoles, dipoles, quadrupoles, octupoles, and polarizable points. + keep_monop = get_monopoles(efp_lines, keep_coords) + keep_dip = get_dipoles(efp_lines, rem_coords) + keep_quadrup = get_quadrupoles(efp_lines, rem_coords) + keep_octup = get_octupoles(efp_lines, rem_coords) + keep_pols = get_polarpts(efp_lines, rem_coords) + + # Extract screening parameters from two different screen sections. + keep_screen = get_screen(efp_lines, keep_coords, 'SCREEN ') + keep_screen2 = get_screen(efp_lines, keep_coords, 'SCREEN2') + + # Write the final output file with the appropriate sections. + with open('cut_' + efp, 'w') as outfile: + # Write header lines. for outline in header: outfile.write(outline) + # Write coordinates. for outline in keep_coords: outfile.write(outline) - outfile.write(' STOP\n'+ - ' MONOPOLES\n') + # Write monopoles section. + outfile.write(' STOP\nMONOPOLES\n') for outline in keep_monop: outfile.write(outline) - outfile.write(' STOP\n'+ - ' DIPOLES\n') + # Write dipoles section. + outfile.write(' STOP\nDIPOLES\n') for outline in keep_dip: outfile.write(outline) - outfile.write(' STOP\n'+ - ' QUADRPOLES\n') + # Write quadrupoles section. + outfile.write(' STOP\nQUADRPOLES\n') for outline in keep_quadrup: outfile.write(outline) - outfile.write(' STOP\n'+ - ' OCTUPOLES\n') + # Write octupoles section. + outfile.write(' STOP\nOCTUPOLES\n') for outline in keep_octup: outfile.write(outline) - outfile.write(' STOP\n'+ - ' POLARIZABLE POINTS\n') + # Write polarizable points section. + outfile.write(' STOP\nPOLARIZABLE POINTS\n') for outline in keep_pols: outfile.write(outline) - outfile.write(' STOP\n'+ - 'SCREEN2 (FROM VDWSCL= 0.700)\n') + # Write screening sections. + outfile.write(' STOP\nSCREEN2 (FROM VDWSCL= 0.700)\n') for outline in keep_screen: outfile.write(outline) - outfile.write('STOP\n'+ - 'SCREEN (FROM VDWSCL= 0.700)\n') + outfile.write('STOP\nSCREEN (FROM VDWSCL= 0.700)\n') for outline in keep_screen2: outfile.write(outline) - outfile.write('STOP\n'+ - ' $END\n') + outfile.write('STOP\n $END\n') if __name__ == "__main__": - #main() - main(sys.argv[1], sys.argv[2]) \ No newline at end of file + # Execute the main function using command-line arguments. + # Example execution: python cut_qm.py ala_33_473.inp a0001.efp + main(sys.argv[1], sys.argv[2]) diff --git a/examples/flex-EFP/Scripts/fragment_RMSD.py b/examples/flex-EFP/Scripts/fragment_RMSD.py index 95b2fa3..9e58ac4 100644 --- a/examples/flex-EFP/Scripts/fragment_RMSD.py +++ b/examples/flex-EFP/Scripts/fragment_RMSD.py @@ -4,132 +4,252 @@ @author: jackl """ + import sys import numpy as np import os -directory = '/depot/lslipche/data/yb_boss/flexible_efp/efpdb/' -ang_cutoff=0.20 # Minimum RMSD allowed for "good" match -cutoff=ang_cutoff*1.8897259886 # Cutoff convert to Bohr -inp=sys.argv[1] -#inp='g_952.inp' # a_22_304.inp for example. Input file with fragment coords -with open(inp, "r") as orig: - orgs = orig.readlines() - -outname=inp.replace('.inp','.efp') # Output file name, same as input but changed extension. - # If no match is found, then outname will not be used. + + +# Directory containing the library files. CHANGE####################################### +base_directory = '/depot/lslipche/data/yb_boss/flexible_efp/efpdb/' + +ang_cutoff = 0.20 # Minimum RMSD allowed for a "good" match (in Angstroms) +bohr_cutoff = ang_cutoff * 1.8897259886 # Convert Angstrom cutoff to Bohr + +#Add to these if there are more +amino_acid_dict = { + 'a': 'ala', 'r': 'arg', 'n': 'asn', 'd': 'asp', 'c': 'cys', + 'q': 'gln', 'e': 'glu', 'g': 'gly', 'h': 'hip', 'i': 'ile', + 'l': 'leu', 'k': 'lys', 'm': 'met', 'f': 'phe', 'p': 'pro', + 's': 'ser', 't': 'thr', 'w': 'trp', 'y': 'tyr', 'v': 'val', + 'hp': 'hip', 'hd': 'hid', 'he': 'hie' +} + +# "M" below is Mg. If atoms are present in your system but not defined here +# you will need to add them here +atom_weights = { + 'H': 1.0, 'O': 15.9949100, 'C': 12.0000000, + 'N': 12.0030700, 'S': 32.0650000, 'M': 23.3040000 +} + + +# Get input filename from command-line argument +inp = sys.argv[1] +# For testing, you might use a fixed input file: +# inp = 'g_952.inp' # e.g., a_22_304.inp. Input file with fragment coordinates + +# Read input file +with open(inp, "r") as orig_file: + input_lines = orig_file.readlines() + +# If good case found, output file name will be the same but .efp instead of .inp +# (Note: This file is only used if a match is found.) +outname = inp.replace('.inp', '.efp') + + + def get_RMSD(coords1,coords2,mass): + """ + Compute the weighted root-mean-square deviation (RMSD) between two sets of coordinates. + + Parameters: + coords1, coords2: 2D lists or arrays containing coordinate data. + weights: List of weights for each coordinate point. + + Returns: + Weighted RMSD. + """ tot=0.0 length=len(coords1) dim=len(coords1[0]) totmass=0.0 + for i in range(length): - for j in range(dim): - tot+=mass[i]*(coords1[i][j]-coords2[i][j])**2 + #Sum distances for each atom, weighted by atomic mass + tot+=mass[i]*sum([(coords1[i][j]-coords2[i][j])**2.0 for j in range(dim)]) totmass+=mass[i] + + # Normalized RMSD calculation. rmsd=np.sqrt(tot)/np.sqrt(totmass) return rmsd def kabsch_algorithm(coords, coords2): - + """ + Compute the optimal rotation matrix using the Kabsch algorithm to align coords onto coords2 + + Parameters: + coords: Coordinates to be rotated + coords2: Reference coordinates + + Returns: + rotation_matrix: Optimal rotation matrix. + com1: Center-of-mass (mean) of the first coordinates + com2: Center-of-mass (mean) of the second coordinates + """ + # Compute centers of mass com1 = np.mean(coords, axis=0) com2 = np.mean(coords2, axis=0) - coords1_aligned = coords - com1 - coords2_aligned=coords2 - com2 + # Center the coordinates + coords1_centered = coords - com1 + coords2_centered = coords2 - com2 - covariance_matrix = np.dot(coords1_aligned.T, coords2_aligned) - #print(covariance_matrix) + # Compute the covariance matrix + covariance_matrix = np.dot(coords1_centered.T, coords2_centered) + # Compute the SVD of the covariance matrix u, _, vh = np.linalg.svd(covariance_matrix) rotation_matrix = np.dot(u, vh) - + + #Check for unwanted reflections/make sure rotation is "right-handed" if np.linalg.det(rotation_matrix) < 0: rotation_matrix[:, -1] *= -1 + return rotation_matrix, com1, com2 def apply_transform(coords, rotation_matrix, com1, com2): + """ + Reads: + coords: Coordinates to transform + rotation_matrix: Rotation matrix computed from the Kabsch algorithm + com1: Center-of-mass of the original coordinates + com2: Center-of-mass of the target coordinates + + Returns: + Transformed coordinates + """ + # Translate coordinates to the origin translated_coords = coords - com1 - + # Rotate the translated coordinates. rotated_coords = np.dot(translated_coords, rotation_matrix) - rotated_coords+=com2 - return rotated_coords + # Translate to the target center of mass + transformed_coords = rotated_coords + com2 + return transformed_coords + + +amino_acid_dict = { + 'a': 'ala', 'r': 'arg', 'n': 'asn', 'd': 'asp', 'c': 'cys', + 'q': 'gln', 'e': 'glu', 'g': 'gly', 'h': 'hip', 'i': 'ile', + 'l': 'leu', 'k': 'lys', 'm': 'met', 'f': 'phe', 'p': 'pro', + 's': 'ser', 't': 'thr', 'w': 'trp', 'y': 'tyr', 'v': 'val', + 'hp': 'hip', 'hd': 'hid', 'he': 'hie' +} + +atom_weights = { + 'H': 1.0, 'O': 15.9949100, 'C': 12.0000000, + 'N': 12.0030700, 'S': 32.0650000, 'M': 23.3040000 +} -amino_acid_dict = {'a':'ala','r':'arg','n':'asn','d':'asp','c':'cys', - 'q':'gln','e':'glu','g':'gly','h':'hip','i':'ile', - 'l':'leu','k':'lys','m':'met','f':'phe','p':'pro', - 's':'ser','t':'thr','w':'trp','y':'tyr','v':'val', - 'hp':'hip','hd':'hid','he':'hie'} -atom_weights = {'H':1.0,'O':15.9949100,'C':12.0000000,'N':12.0030700,'S':32.0650000,'M':23.3040000} + + +# Extract the residue code from the input filename (assumes filename starts with a residue code) try: - res=amino_acid_dict[inp.split('_')[0]] -except: - print('No library folder for: '+inp) - exit() + res = amino_acid_dict[inp.split('_')[0]] +except KeyError: + print('No library folder for: ' + inp) + sys.exit() + +directory = os.path.join(base_directory, res) -directory+=res +# Define the file extension for matching EFP files. +file_extension = '.efp' -file_extension = '.efp' # change this to your desired file extension +# -------------------------- +# Read coordinates from the input file +# -------------------------- +# (1 Angstrom = 0.529177249 Bohr) +ANGSTROM_TO_BOHR = 0.529177249 -#Grab coordinates from the input file, convert from Angstroms to bohrs. -start=0 -xyz=[] -orig_coords=[] -weights=[] -inp_at_lines=[] -for line in orgs: +orig_coords = [] # list of [x, y, z] coordinates in Bohr +weights = [] # list of atomic weights corresponding to each coordinate + +reading_coords = False +for line in input_lines: if '$end' in line: - start=0 - elif(start==1): - inp_at_lines.append(line) - xyz.append(float(line.split()[2])/0.529177249) - xyz.append(float(line.split()[3])/0.529177249) - xyz.append(float(line.split()[4])/0.529177249) - orig_coords.append(xyz) - xyz=[] - weights.append(atom_weights[line.split()[0][0]]) + reading_coords = False + elif reading_coords: + parts = line.split() + # Convert coordinates from Angstroms to Bohr + x = float(parts[2]) / ANGSTROM_TO_BOHR + y = float(parts[3]) / ANGSTROM_TO_BOHR + z = float(parts[4]) / ANGSTROM_TO_BOHR + + orig_coords.append([x, y, z]) + # Use the first character of the atom type to lookup the weight + try: + weights.append(atom_weights[parts[0][0]]) + except: + print('Atom '+parts[0]+' is not defined. Define it and its atomic mass in `atom_weights`.') elif 'C1' in line: - start=1 -minrmsd=20.0 # Arbitrary starting value. This will be replaced with real RMSD values in the loop. + # Start reading coordinates after encountering 'C1' + reading_coords = True -# Loop through files in the directory +# -------------------------- +# Loop through EFP fragment files in the specified directory +# -------------------------- + +min_rmsd = 20.0 # Initialize with a large value; will be updated if a better match is found. +match = None # Will store the file path of the best match + +# Loop over files with the specified extension in the directory. for filename in os.listdir(directory): - if filename.endswith(file_extension): - full_path = os.path.join(directory, filename) - with open(full_path, "r") as term: - terms = term.readlines() - else: - continue - start=0 - xyz=[] - term_coords=[] - for line in terms: + if not filename.endswith(file_extension): + continue # Skip files that do not match the extension + + full_path = os.path.join(directory, filename) + + # Read the candidate file. + with open(full_path, "r") as candidate_file: + candidate_lines = candidate_file.readlines() + + # Grab coordinates from the fragment file + # The coordinates are assumed to be in the section following the header "COORDINATES (BOHR)" + # and ending when a line containing 'BO21' is encountered, this is the standard name for the first "bond midpoint" + candidate_coords = [] + reading_candidate_coords = False + for line in candidate_lines: if 'BO21' in line: - start=0 - elif(start==1): - xyz.append(float(line.split()[1])) - xyz.append(float(line.split()[2])) - xyz.append(float(line.split()[3])) - term_coords.append(xyz) - xyz=[] + reading_candidate_coords = False + elif reading_candidate_coords: + parts = line.split() + x = float(parts[1]) + y = float(parts[2]) + z = float(parts[3]) + candidate_coords.append([x, y, z]) elif 'COORDINATES (BOHR)' in line: - start=1 - new_coords=np.array(term_coords) - coords2=np.array(orig_coords) - rotation_matrix, com1, com2 = kabsch_algorithm(new_coords, coords2) - aligned_new_coords = apply_transform(new_coords, rotation_matrix, com1,com2) - rmsd=(get_RMSD(aligned_new_coords,coords2,weights)) - - if(rmsd3): + border = 1 + qm = 0 + if len(line.split()) > 3: QMs.append(int(line.split()[3])) elif 'QM_atoms' in line: - qm=1 + qm = 1 return QMs, pol_rem + def cut_frag(head, tail): - """Calculate virtual coordinates for a fragment based on desired bond distance.""" - desired_dist = 1.07886 + """ + Calculate coordinates for a virutal hydrogen using a desired bond distance. + + Parameters: + head: Line containing the head atom information. + tail: Line containing the tail atom information. + + Returns: + A list with the virtual coordinate [x, y, z] for the H-atom. + """ + desired_dist = 1.07886 # Desired bond distance + # Multiply by 10 to scale the coordinates. xh, yh, zh = [float(head.split()[i]) * 10 for i in range(4, 7)] xt, yt, zt = [float(tail.split()[i]) * 10 for i in range(4, 7)] + # Calculate the magnitude of the distance vector. dist_mag = np.sqrt((xh - xt)**2 + (yh - yt)**2 + (zh - zt)**2) - + # Calculate the new coordinate at the desired distance. h_t = [ ((xt - xh) * desired_dist / dist_mag) + xh, ((yt - yh) * desired_dist / dist_mag) + yh, @@ -94,140 +143,211 @@ def cut_frag(head, tail): ] return h_t + def make_inp(fragment, QMs, POLs): - """Generate input file for a fragment.""" - txt = [] - num_virtuals=0 + """ + Generate an input (.inp) file for a fragment. + + Parameters: + fragment: List of lines containing atoms in the fragment. + QMs: List of QM atom names to be marked for removal. + POLs: List of atom names to have polarization points removed. + """ + lines_out = [] + num_virtuals = 0 + charge = 0 + + # Count virtual atoms (labeled as 'H000') for atom in fragment: if 'H000' in atom: - num_virtuals+=1 - if(len(QMs)+num_virtuals==len(fragment)): + num_virtuals += 1 + + # If all atoms are either QM or virtual atoms, do not write an input. + if len(QMs) + num_virtuals == len(fragment): return - if fragment[4].split()[1] in spec_AAs: - charge = AA_charge[fragment[4].split()[1]] - elif len(fragment[4].split()[1])==4: - if(fragment[4].split()[1][0]=='N'): - charge=1 - elif(fragment[4].split()[1][0]=='C'): - charge=-1 - else: - charge = 0 - - if fragment[4].split()[1] in known_amino_acids: - filename = amino_acid_dict[fragment[4].split()[1]] + '_' + fragment[4].split()[0] + '_' + fragment[0].split()[3] + '.inp' + + # Determine the charge based on the residue type. + res_info = fragment[4].split()[1] + if res_info in spec_AAs: + charge = AA_charge[res_info] + elif len(res_info) == 4: + # Assume N-terminal or C-terminal if the residue code starts with N or C. + if res_info[0] == 'N': + charge = 1 + elif res_info[0] == 'C': + charge = -1 + + # Construct the filename based on residue information. + #Use 1 letter identifier for Amino acids, otherwise filename will match residue name + if res_info in known_amino_acids: + filename = amino_acid_dict[res_info] + '_' + fragment[4].split()[0] + '_' + fragment[0].split()[3] + '.inp' else: - filename = fragment[4].split()[1].lower() + '_' + fragment[4].split()[0] + '_' + fragment[0].split()[3] + '.inp' - - txt.append(f" $contrl units=angs local=boys runtyp=makefp \n" - f" mult=1 icharg={charge} coord=cart icut=11 $end\n" - f" $system timlim=99999 mwords=200 $end\n" - f" $scf soscf=.f. dirscf=.t. diis=.t. CONV=1.0d-06 $end\n" - f" $basis gbasis=n31 ngauss=6 ndfunc=1 $end\n" - f" $DAMP IFTTYP(1)=2,0 IFTFIX(1)=1,1 thrsh=500.0 $end\n" - f" $MAKEFP POL=.t. DISP=.f. CHTR=.f. EXREP=.f. $end\n" - f" $data\n {filename.split('.')[0]}\n C1\n") - + filename = res_info.lower() + '_' + fragment[4].split()[0] + '_' + fragment[0].split()[3] + '.inp' + + # Build header sections for the input file. + # You may want to adjust these parameters. + lines_out.append( + f" $contrl units=angs local=boys runtyp=makefp \n" + f" mult=1 icharg={charge} coord=cart icut=11 $end\n" + f" $system timlim=99999 mwords=200 $end\n" + f" $scf soscf=.f. dirscf=.t. diis=.t. CONV=1.0d-06 $end\n" + f" $basis gbasis=n31 ngauss=6 ndfunc=1 $end\n" + f" $DAMP IFTTYP(1)=2,0 IFTFIX(1)=1,1 thrsh=500.0 $end\n" + f" $MAKEFP POL=.t. DISP=.f. CHTR=.f. EXREP=.f. $end\n" + f" $data\n {filename.split('.')[0]}\n C1\n" + ) + + # Process each atom line. for atom in fragment: if 'H000' in atom: - txt.append(atom) + # Virtual atoms are written as is. + lines_out.append(atom) else: - col1 = f" {atom.split()[2]}" - col1 = col1.ljust(6) - + # Format the atom line. + # First column: atom identifier from the fragment. + col1 = f" {atom.split()[2]}".ljust(6) + # Determine atomic number based on element symbol. + # Two-letter atomnames will need to be added here; this is a lazy solution. if atom.split()[2][0] == 'M': col2 = at_sym['MG'] else: col2 = at_sym[atom.split()[2][0]] - + # Get coordinates, scaling by 10. Then format x, y, z = [float(atom.split()[i]) * 10 for i in range(4, 7)] col3 = f"{x:.8f}".rjust(17) col4 = f"{y:.8f}".rjust(18) col5 = f"{z:.8f}".rjust(18) - - txt.append(f"{col1}{col2}{col3}{col4}{col5}\n") - - txt.append(" $end \n!comment atoms to be erased:") + lines_out.append(f"{col1}{col2}{col3}{col4}{col5}\n") + + # Append comments regarding atoms to be removed later. + lines_out.append(" $end \n!comment atoms to be erased:") for atomname in QMs: - txt.append(" "+atomname) - txt.append("\n") - txt.append("!polarization points to remove:") + lines_out.append(" " + str(atomname)) + lines_out.append("\n") + # Append comments regarding atoms that will have polarization removed. + lines_out.append("!polarization points to remove:") for atomname in POLs: - txt.append(" "+atomname) - - #" $end\n") - + lines_out.append(" " + str(atomname)) + + # Write the constructed lines to the file. with open(filename, 'w') as outfile: - outfile.writelines(txt) + outfile.writelines(lines_out) -def QM_MM_covalent(MM,QMs,topol_file): - with open(topol_file, 'r') as lines: - bonds=0 - bond_IDs=[] - for line in lines: + +def QM_MM_covalent(MM, QMs, topol_file): + """ + For a given MM atom that is covalently bound to a QM atom, find other MM atoms that are covalently bonded + to the given atom but are not in the QM region. + + This function parses the topology file for bonds and returns a list of atom IDs. + WARNING this script assumes matching atom IDs between the full structure .g96 and the topology .top files; + -the script cannot see how your topology is defined. + + Parameters: + MM: The MM atom index. + QMs: List of QM atom indices. + topol_file: Path to the topology file. + + Returns: + bond_IDs: List of MM atom indices covalently bonded to the given MM atom. + """ + bond_IDs = [] + bonds_section = False + with open(topol_file, 'r') as f: + for line in f: + # Identify the bonds section. if '[ bonds ]' in line: - bonds=1 + bonds_section = True continue - elif bonds and len(line.split())<2: + elif bonds_section and len(line.split()) < 2: + # End of bonds section. return bond_IDs - elif bonds and (line[0]==';'): + elif bonds_section and line.strip().startswith(';'): + # Skip comment lines. continue - elif bonds and (int(line.split()[0])==MM) and int(line.split()[1]) not in QMs: - bond_IDs.append(int(line.split()[1])) - elif bonds and (int(line.split()[1])==MM) and int(line.split()[0]) not in QMs: - bond_IDs.append(int(line.split()[0])) - - - -#Gather from user which atoms are QM region and which are MM covalently bound to QM + elif bonds_section: + atom1 = int(line.split()[0]) + atom2 = int(line.split()[1]) + # If one atom is given MM and the other is not in the QM region, add the non‐QM atom. + if atom1 == MM and atom2 not in QMs: + bond_IDs.append(atom2) + elif atom2 == MM and atom1 not in QMs: + bond_IDs.append(atom1) + return bond_IDs + +# --------------------------- +# Main Script Processing +# --------------------------- + +# Determine QM region atoms and polarization removal atoms from the settings file. qm_IDs, MM_remove = qm_atoms(settings_file) -#Take MM atoms that are covalently bound to QM region, then find the other MM atoms bound to that -#border atom (MM-MM-QM bonds, looking for the first MM atom). These atoms will have polarization points -#removed. The middle MM atom will be removed entirely to avoid overlap with QM virtual hydrogens. -pol_remove=[] +# For each MM atom covalently bound to the QM region, find the other MM atoms bound +# to it. These atoms will have polarization points removed. +pol_remove = [] for atom in MM_remove: - found_pol_remove=QM_MM_covalent(atom,qm_IDs,topol_file) + found_pol_remove = QM_MM_covalent(atom, qm_IDs, topol_file) for bound_atom in found_pol_remove: pol_remove.append(bound_atom) -# Process EFP shell residue IDs and names +# --------------------------- +# Process EFP Region Residues +# --------------------------- +# Get a list of unique residue IDs from the EFP file. +# Note that atom IDs do not match between EFP and full structures. However, residue IDs do match efp_resis = [] prevres = '0' -start = 0 - -for line in f0: - if start == 1: - if(len(line.split())<3): +start = False +for line in efp_lines: + if start: + if len(line.split()) < 3: break - if(line.split()[0]!=prevres): + #only count residues once + if line.split()[0] != prevres: efp_resis.append(line.split()[0]) prevres = line.split()[0] if 'POSITION' in line: - start = 1 + start = True + +# --------------------------- +# Process Fragments from Full Configuration File +# --------------------------- +i = 0 # Index for current EFP residue +prev_co = [] # List to store previous "C" or "O" atom lines; these will be 1 residue number "off" from .g96 convention +frag = [] # List to append all fragment lines +CAs = [] # List to accumulate "CA" (alpha carbon) lines +start = False -# Fragment processing and input generation -i = 0 -prev_co = [] -frag = [] -CAs = [] -end = 0 -start = 0 +#"C", "O", and "CA" atoms are significant. We redefine residues using these atom names -for line in g0: +for line in full_lines: if 'END' in line: - if(start==1): + if start: break - elif 'SOL' in line or 'QSL' in line: + #elif 'SOL' in line: + elif 'SOL' in line or 'QSL' in line: #Sol is standard water, QSL is my QM region water, this is not standardized continue - elif start == 1: + elif start: + # Collect CA atoms. if line.split()[2] == 'CA': CAs.append(line) + # Collect "O" atoms. elif line.split()[2] == 'O': prev_co.append(line) + # Process "C" atoms. This is the "end" of a residue as we would like to define it. elif line.split()[2] == 'C': prev_co.append(line) - if(line.split()[0]==efp_resis[i]): - if len(line.split()[1]) == 4: + # If the current line belongs to the current EFP residue. + if line.split()[0] == efp_resis[i]: + # standard amino acid fragments (non-terminals, non-cofactors) need two virtual hydrogens. + if line.split()[1] in known_amino_acids: + vH1 = cut_frag(frag[0], CAs[-2]) #Cut between current C and previous CA + vH2 = cut_frag(CAs[-1], line) #Cut between current CA and next C + frag.append(f" H000 1.0 {vH1[0]:.8f} {vH1[1]:.8f} {vH1[2]:.8f}\n") + frag.append(f" H000 1.0 {vH2[0]:.8f} {vH2[1]:.8f} {vH2[2]:.8f}\n") + # If the residue identifier (field 1) has length 4 (a non-standard amino acid), + # determine if it is N-terminal or C-terminal. + elif len(line.split()[1]) == 4: if line.split()[1][1:4] in known_amino_acids: if line.split()[1][0] == 'N': # N-terminal residue vH2 = cut_frag(CAs[-1], line) @@ -235,46 +355,44 @@ def QM_MM_covalent(MM,QMs,topol_file): elif line.split()[1][0] == 'C': # C-terminal residue vH1 = cut_frag(frag[0], CAs[-2]) frag.append(f" H000 1.0 {vH1[0]:.8f} {vH1[1]:.8f} {vH1[2]:.8f}\n") - elif line.split()[1] in known_amino_acids: - #print(frag) - vH1 = cut_frag(frag[0], CAs[-2]) - vH2 = cut_frag(CAs[-1], line) - frag.append(f" H000 1.0 {vH1[0]:.8f} {vH1[1]:.8f} {vH1[2]:.8f}\n") - frag.append(f" H000 1.0 {vH2[0]:.8f} {vH2[1]:.8f} {vH2[2]:.8f}\n") - + # If there is enough fragment data, determine which atoms to remove. if len(frag) > 1: - qm_names=[] - pol_names=[] + qm_names = [] + pol_names = [] for atom in frag: - if 'H000' in atom: + if 'H000' in atom: #All virtual hydrogens will be removed, no need to specify in comment continue elif int(atom.split()[3]) in qm_IDs: qm_names.append(atom.split()[2]) elif int(atom.split()[3]) in pol_remove: pol_names.append(atom.split()[2]) - make_inp(frag,qm_names,pol_names) - frag = [] - i += 1 - if(len(frag)>1) and (frag[-1].split()[0])==efp_resis[i]: - if(frag[0].split()[0]!=line.split()[0]) and frag[0].split()[1] not in known_amino_acids: + make_inp(frag, qm_names, pol_names) + frag = [] # Reset fragment lines + i += 1 # Move to the next EFP residue. + # Check if the fragment is complete (with more than one line) + # Check that this frag is the current EFP residue (and end of that residue) + # Check that this frag is not an amino acid (is a cofactor, lipid, etc) [redundant] + # No virtual hydrogens are added here + if len(frag) > 1 and frag[-1].split()[0] == efp_resis[i]: + if (frag[0].split()[0] != line.split()[0]) and frag[0].split()[1] not in known_amino_acids: if frag[0].split()[1][1:] not in known_amino_acids: - qm_names=[] - pol_names=[] + qm_names = [] + pol_names = [] for atom in frag: if int(atom.split()[3]) in qm_IDs: qm_names.append(atom.split()[2]) elif int(atom.split()[3]) in pol_remove: pol_names.append(atom.split()[2]) - make_inp(frag,qm_names,pol_names) + make_inp(frag, qm_names, pol_names) frag = [] i += 1 - + # If the current line belongs to the current EFP residue, add it to the fragment. if line.split()[0] == efp_resis[i]: if len(prev_co) > 1 and line.split()[1] in known_amino_acids: - #frag.extend(prev_co) + # Add the previous two "C" or "O" atoms to the fragment if the current fragment is an amino acid frag.append(prev_co[-2]) frag.append(prev_co[-1]) prev_co = [] frag.append(line) elif 'POSITION' in line: - start = 1 + start = True \ No newline at end of file diff --git a/examples/flex-EFP/Scripts/make_bcls.py b/examples/flex-EFP/Scripts/make_bcls.py index 441f599..95d960c 100644 --- a/examples/flex-EFP/Scripts/make_bcls.py +++ b/examples/flex-EFP/Scripts/make_bcls.py @@ -4,125 +4,245 @@ @author: jackl -reads in .g96 of efp region and creates .inp files for bacteriochlorophyl a molecules. -Head and tail groups are separate fragments. +This script reads in a .g96 file of the EFP region and a full configuration .g96 file, +and then creates .inp files for bacteriochlorophyll a molecules. In this process, the +head and tail groups are treated as separate fragments. + +This is a "hard-coded" script, that is, this script is very specific to chlorophylls +or bacteriochlorophylls (specifically chloropyll a and bacteriochlorophyll a). +If you intend to separate a molecule into multiplefragment .efp files, you will +need to edit this script. """ import numpy as np import sys -#site='359' -g96_file=sys.argv[1] -#g96_file='efp_pair53004.g96' -#site=sys.argv[2] - -tailside='C6' -headside='C5' -site=['752','754'] -oldres='752' - -def cut_frag(head,tail): - desired_dist=1.07886 - xh=float(head.split()[4])*10 - yh=float(head.split()[5])*10 - zh=float(head.split()[6])*10 - xt=float(tail.split()[4])*10 - yt=float(tail.split()[5])*10 - zt=float(tail.split()[6])*10 - dist_mag=np.sqrt(((xh-xt)**2)+((yh-yt)**2)+((zh-zt)**2)) - #print(dist_mag,desired_dist) - t_h=[((xh-xt)*desired_dist/dist_mag)+xt,((yh-yt)*desired_dist/dist_mag)+yt,((zh-zt)*desired_dist/dist_mag)+zt] - h_t=[((xt-xh)*desired_dist/dist_mag)+xh,((yt-yh)*desired_dist/dist_mag)+yh,((zt-zh)*desired_dist/dist_mag)+zh] - return h_t,t_h +# --------------------------- +# Input files and cutting definitions +# --------------------------- +# Command-line arguments: +# Sample execution: python make_bcls_V2.py efp_opt_83855.g96 optimized_83855.g96 +g96_file = sys.argv[1] # EFP region file (e.g., "efp_opt_83855.g96") +full_g96_file = sys.argv[2] # Full configuration file (e.g., "optimized_83855.g96") + +# Settings for fragment splitting. the bond between atoms C5 and C6 is where the fragments split. +tailside = 'C6' # Atom name defining the tail side boundary +headside = 'C5' # Atom name defining the head side boundary + +# List of QM residue IDs. If you want all CLA/BCL residues to be made into fragments, make this empty +site = ['752', '754', '753', '790', '792', '793'] +oldres = '751' # Initial residue number for tracking changes + +# --------------------------- +# Function Definitions +# --------------------------- +def cut_frag(head, tail): + """ + This function uses the coordinates from two atoms (head and tail) to compute + the positions for adding virtual hydrogens to cap the now-separated fragments + (along the vector that is the C5-C6 bond). + + Parameters: + head: A string line with the head atom lines. + tail: A string line with the tail atom lines. + + Returns: + h_t: coordinates [x, y, z] for the virtual hydrogen on the head side. + t_h: coordinates [x, y, z] for the virtual hydrogen on the tail side. + """ + desired_dist = 1.07886 # Desired bond distance + # Scale coordinates by 10 (nm -> angstrom) + xh, yh, zh = [float(head.split()[i]) * 10 for i in range(4, 7)] + xt, yt, zt = [float(tail.split()[i]) * 10 for i in range(4, 7)] + # Calculate the magnitude of the distance vector between head and tail. + dist_mag = np.sqrt((xh - xt)**2 + (yh - yt)**2 + (zh - zt)**2) + # Compute the new coordinates along the head-to-tail vector. + h_t = [ + ((xt - xh) * desired_dist / dist_mag) + xh, + ((yt - yh) * desired_dist / dist_mag) + yh, + ((zt - zh) * desired_dist / dist_mag) + zh + ] + # Compute the new coordinates along the tail-to-head vector. + t_h = [ + ((xh - xt) * desired_dist / dist_mag) + xt, + ((yh - yt) * desired_dist / dist_mag) + yt, + ((zh - zt) * desired_dist / dist_mag) + zt + ] + return h_t, t_h def make_inp(fragment): - txt=[] - filename='cla_t_'+fragment[0].split()[0]+'.inp' - txt.append(' $contrl units=angs local=boys runtyp=makefp \n'+ -' mult=1 icharg=0 coord=cart icut=11 $end\n'+ -' $system timlim=99999 mwords=200 $end\n'+ -' $scf soscf=.f. dirscf=.t. diis=.t. CONV=1.0d-06 $end\n'+ -' $basis gbasis=n31 ngauss=6 ndfunc=1 $end\n'+ -' $DAMP IFTTYP(1)=2,0 IFTFIX(1)=1,1 thrsh=500.0 $end\n'+ -' $MAKEFP POL=.t. DISP=.f. CHTR=.f. EXREP=.f. $end\n'+ -' $data\n'+ -' '+filename.split('.')[0]+'\n'+ -' C1\n') + """ + Generate an input (.inp) file for a fragment of bacteriochlorophyll a. + The filename is based on the fragment's residue number and atom ID. + The file includes various control sections and a coordinate list. + + Parameters: + fragment: Lines of atoms comprising the fragment. + """ + txt = [] + # Extract residue number and atom ID from the first line of the fragment. + resnum = fragment[0].split()[0] + atomid = fragment[0].split()[3] + + # Choose a filename prefix based on whether the first fragment line contains 'MG' + if 'MG' in fragment[0]: + namestart = 'clah_' + else: + namestart = 'clat_' + filename = namestart + resnum + '_' + atomid + '.inp' + + # Construct the header for the input file. + # You may want to customize these paramaters. + header = ( + " $contrl units=angs local=boys runtyp=makefp \n" + " mult=1 icharg=0 coord=cart icut=11 $end\n" + " $system timlim=99999 mwords=4000 $end\n" + " $scf soscf=.f. dirscf=.t. diis=.t. CONV=1.0d-06 $end\n" + " $basis gbasis=n31 ngauss=6 ndfunc=1 $end\n" + " $DAMP IFTTYP(1)=2,0 IFTFIX(1)=1,1 thrsh=500.0 $end\n" + " $MAKEFP POL=.t. DISP=.f. CHTR=.f. EXREP=.f. $end\n" + " $data\n" + " " + filename.split('.')[0] + "\n" + " C1\n" + ) + txt.append(header) + + # Append each atom line. for atom in fragment: if 'H000' in atom: + # Virtual hydrogen atoms are taken as is. txt.append(atom) else: - if(atom.split()[2][0]=='M'): - col1=' '+atom.split()[2][0:2]+atom.split()[3]+' ' - col2=at_sym['MG'] - filename='cla_h_'+atom.split()[0]+'.inp' + # Format the lines. + col1 = f" {atom.split()[2]}".ljust(6) + # Determine the atomic number using the at_sym dictionary. + # WARNING: Atoms with >1 letter symbols must be added if you have them. + if atom.split()[2][0] == 'M': + col2 = at_sym['MG'] else: - col1=' '+atom.split()[2][0]+atom.split()[3]+' ' - col2=at_sym[atom.split()[2][0]] - x=float(atom.split()[4])*10 - #col3=(str(x)) - col3=str("{:.8f}".format(x)) - while(len(col3)<17): - col3=' '+col3 - y=float(atom.split()[5])*10 - col4=str("{:.8f}".format(y)) - while(len(col4)<18): - col4=' '+col4 - z=float(atom.split()[6])*10 - col5=str("{:.8f}".format(z)) - while(len(col5)<18): - col5=' '+col5 - txt.append(col1+col2+col3+col4+col5+'\n') - txt.append(' $end \n'+ -' $comment Atoms to be erased: $end\n') - with open(filename,'w') as outfile: - for line1 in txt: - outfile.write(line1) - -at_sym={'H':'1.0','C':'6.0','N':'7.0','O':'8.0','MG':'12.0','P':'15.0','S':'16.0','FE':'26.0', - 'NA':'11.0','CL':'17.0' - } - -Rings=['MG','CHA','CHB','CHC','CHD','NA','C1A','C2A','C3A','C4A','CMA','NB','C1B','C2B','C3B', - 'C4B','CMB','CAB','CBB','NC','C1C','C2C','C3C','C4C','CMC','CAC','CBC','ND','C1D','C2D', - 'C3D','C4D','CMD','CAD','OBD','CBD','CGD','O1D','O2D','CED','CAA','CBA','CGA','O1A', - 'O2A','C1','C2','C3','C4','C5','H1','H2','H3','H4','H5','H6','H7','H8','H9','H10','H11', - 'H12','H13','H14','H15','H16','H17','H18','H19','H20','H21','H22','H23','H24','H25', - 'H26','H27','H28','H29','H30','H31','H32','H33','H34','H35','H36','H37','H38','H39', - 'H40','H41'] - -#with open('bchl359-50028.g96','r') as g96: -# g0=g96.readlines() - -with open(g96_file,'r') as g96: - g0=g96.readlines() + col2 = at_sym[atom.split()[2][0]] + # nm -> angstrom. + x, y, z = [float(atom.split()[i]) * 10 for i in range(4, 7)] + col3 = f"{x:.8f}".rjust(17) + col4 = f"{y:.8f}".rjust(18) + col5 = f"{z:.8f}".rjust(18) + txt.append(f"{col1}{col2}{col3}{col4}{col5}\n") + + # Append comments regarding atoms to be removed. + txt.append(" $end \n!comment atoms to be erased:") + # The QM region might include the entire chlorophyll or just the headring. + # In case of only headring, a tail fragment must still be created, and + # C6 is an MM atom covalently bound to C5, a QM atom. + if fragment[0].split()[0] in site: + txt.append(" C6") + txt.append("\n") + txt.append("!polarization points to remove:") + if fragment[0].split()[0] in site: + #H42, H43, C7 are bound to C6, polarizability must be romoved + txt.append(" H42 H43 C7") -#tailside and headside represent the atoms that are bound but are to be broken to create -#separate fragment files. -curr_head=[] -curr_tail=[] -#BCLs=[] -for line in g0: + # Write the output to a file. + with open(filename, 'w') as outfile: + outfile.writelines(txt) + +# --------------------------- +# Global Dictionaries and Lists +# --------------------------- +# Dictionary to format atomic symbols to numbers (for output). +at_sym = { + 'H': '1.0', 'C': '6.0', 'N': '7.0', 'O': '8.0', 'MG': '12.0', + 'P': '15.0', 'S': '16.0', 'FE': '26.0', 'NA': '11.0', 'CL': '17.0' +} + +# List of headring atoms. List includes names for YB's BCL and Reppert group's CLA. +Rings = [ + 'MG','CHA','CHB','CHC','CHD','NA','C1A','C2A','C3A','C4A','CMA','NB', + 'C1B','C2B','C3B','C4B','CMB','CAB','CBB','NC','C1C','C2C','C3C','C4C', + 'CMC','CAC','CBC','ND','C1D','C2D','C3D','C4D','CMD','CAD','OBD','CBD', + 'CGD','O1D','O2D','CED','CAA','CBA','CGA','O1A','O2A','C1','C2','C3','C4', + 'C5','H1','H2','H3','H4','H5','H6','H7','H8','H9','H10','H11','H12','H13', + 'H14','H15','H16','H17','H18','H19','H20','H21','H22','H23','H24','H25', + 'H26','H27','H28','H29','H30','H31','H32','H33','H34','H35','H36','H37', + 'H38','H39','H40','H41' +] + +# --------------------------- +# File Reading and Fragment Identification +# --------------------------- +# Read the EFP region file. +with open(g96_file, 'r') as efp: + efp_lines = efp.readlines() + +# Read the full configuration file. +with open(full_g96_file, 'r') as g96: + full_lines = g96.readlines() + +# Identify residue numbers (from lines containing 'CLA') in the EFP file that are not in the 'site' list. +frag_cla_resnums = [] +for line in efp_lines: if 'CLA' in line: - if(oldres!=line.split()[0]): - head_coord,tail_coord = cut_frag(head_cut,tail_cut) - curr_head.append(' H000 1.0 '+str("{:.8f}".format(head_coord[0]))+' '+str("{:.8f}".format(head_coord[1]))+' '+str("{:.8f}".format(head_coord[2]))+'\n') - curr_tail.append(' H000 1.0 '+str("{:.8f}".format(tail_coord[0]))+' '+str("{:.8f}".format(tail_coord[1]))+' '+str("{:.8f}".format(tail_coord[2]))+'\n') - #if(oldres!=site): + res_num = line.split()[0] + if res_num in frag_cla_resnums: + continue + elif res_num not in site: + frag_cla_resnums.append(res_num) + +# --------------------------- +# Fragment Processing for Head and Tail Groups +# --------------------------- +# 'tailside' and 'headside' are the atoms where bonds are to be broken to create +# separate fragment files. +curr_head = [] # Append lines for the "head" fragment +curr_tail = [] # append lines for the "tail" fragment +head_cut = '' # Temporary storage for the head atom used for virtual bond creation + +# Process each line in the full configuration file. +for line in full_lines: + if 'CLA' in line: + # Process only for fragments with residue numbers in our list. + if line.split()[0] in frag_cla_resnums and (oldres != line.split()[0]): + # If no head_cut has been found, update oldres and continue. + if len(head_cut) < 3: + oldres = line.split()[0] + continue + # Once both a head and tail line have been recorded, compute virtual hydrogen positions. + head_coord, tail_coord = cut_frag(head_cut, tail_cut) + # Append the virtual hydrogen lines to the corresponding fragment lists. + curr_head.append( + f" H000 1.0" + + f"{head_coord[0]:.8f}".rjust(17) + + f"{head_coord[1]:.8f}".rjust(18) + + f"{head_coord[2]:.8f}".rjust(18) + "\n" + ) + curr_tail.append( + f" H000 1.0" + + f"{tail_coord[0]:.8f}".rjust(17) + + f"{tail_coord[1]:.8f}".rjust(18) + + f"{tail_coord[2]:.8f}".rjust(18) + "\n" + ) + # Reset head_cut for the next fragment. + head_cut = '' + # Depending on whether the residue is in the 'site' list, write both fragments or only the tail. if oldres not in site: make_inp(curr_head) make_inp(curr_tail) - curr_head=[] - curr_tail=[] - oldres=line.split()[0] - atomname=line.split()[2] - if atomname in Rings: - curr_head.append(line) - else: - curr_tail.append(line) - if(atomname==tailside): - tail_cut=line - elif(atomname==headside): - head_cut=line + else: + make_inp(curr_tail) + # Reset the current fragment accumulators. + curr_head = [] + curr_tail = [] + oldres = line.split()[0] - \ No newline at end of file + # Get the current atom name. + atomname = line.split()[2] + # Append the line into the desired fragment based on the Rings list. + if line.split()[0] in frag_cla_resnums: + if atomname in Rings: + curr_head.append(line) + else: + curr_tail.append(line) + # Save the line for later use if it matches the headside or tailside. + if atomname == tailside: + tail_cut = line + elif atomname == headside: + head_cut = line diff --git a/examples/flex-EFP/Scripts/make_mm.py b/examples/flex-EFP/Scripts/make_mm.py index acd1978..a83a872 100644 --- a/examples/flex-EFP/Scripts/make_mm.py +++ b/examples/flex-EFP/Scripts/make_mm.py @@ -3,165 +3,332 @@ Created on Wed Sep 18 12:28:23 2024 @author: jackl + +This script reads in EFP region file (.g96), a full structure file (.g96), and a topology file (.top or .itp) +to extract MM (molecular mechanics) coordinates, charges, and screening parameters.The extracted information +is then written to an output file ("prot.efp"). """ import sys -water_and_ions={'SOL':-0.834,'CL':-1.0,'NA':1.0} +# Global dictionaries and lists + +# 'SOL' returns the charge of oxygen in the water model (TIP3P). H charge is taken to be -1/2 * oxygen charge. +# Change these for different water models. +water_and_ions = { + 'SOL': -0.834, + 'CL': -1.0, + 'NA': 1.0 +} + +#All non-amino acid residue names should be in this list +known_cofactors = ['ECH', '45D', 'EQ3', 'C7Z', 'CLA', 'PQN', 'BCR', 'QLA', 'LHG', 'LMG', 'SQD', 'LMT'] -def get_EFPs(efplines): - efp_resis=[] - start=0 - prevres='0' - for line in efplines: + +def get_EFPs(efp_lines): + """ + Grab the EFP file lines to extract unique residue IDs and names. + + Parameters: + efp_lines (list of str): Lines from an EFP .g96 file. + + Returns: + efp_resis (list of [resid, resname]): A list of residue information. + """ + efp_resis = [] + start = False + prev_res = None + for line in efp_lines: if 'END' in line: - if(len(efp_resis)>1): + if len(efp_resis) > 1: return efp_resis - elif start == 1: - if(line.split()[0]!=prevres): - resid=line.split()[0] - resname=line.split()[1] - efp_resis.append([resid,resname]) - prevres = resid + elif start: + # When in the POSITION block, add a new residue if different from previous. + parts = line.split() + if parts and (parts[0] != prev_res): + resid = parts[0] + resname = parts[1] + efp_resis.append([resid, resname]) + prev_res = resid if 'POSITION' in line: - start = 1 - -def get_MM_coords(mm_indexes,g96): - MMs=[] - temp_MMs=[] - prev_MM=0 - i=0 - prev_resid=0 - for line in g96: - if(len(line.split())<4) or (line[0]!=' '): + start = True + return efp_resis + + +def get_MM_coords(mm_indexes, g96_lines): + """ + Extract MM atom coordinates from a full configuration .g96 file. + + For each residue (given by mm_indexes), this function extracts coordinates + for the MM atoms. A conversion factor (18.897161646321) converts nm -> Bohr. + + Parameters: + mm_indexes (list of [resid, resname]): Residues to process. + g96_lines (list of str): Lines from the full configuration .g96 file. + + Returns: + MMs (list of str): Formatted coordinate lines. + """ + MMs = [] + temp_MMs = [] + prev_MM_flag = 0 + index = 0 + prev_resid = None + conversion = 18.897161646321 # nm -> Bohr + + for line in g96_lines: + parts = line.split() + if len(parts) < 4 or line[0] != ' ': continue - elif(line.split()[0]!=prev_resid) and (prev_MM==1): - i+=1 - if(line.split()[2]=='C') or (line.split()[2]=='O'): - col1=line.split()[2][0]+line.split()[3] - while(len(col1)<7): - col1+=' ' - col2=float(line.split()[4])*18.897161646321 - col3=float(line.split()[5])*18.897161646321 - col4=float(line.split()[6])*18.897161646321 - temp_MMs.append(col1+'%20.12f'%col2+'%20.12f'%col3+'%20.12f'%col4+'\n') + + # If residue changes and a MM atom was processed, advance the index. + if parts[0] != prev_resid and prev_MM_flag == 1: + index += 1 + + # Process lines with atom type "C" or "O"; we do not know yet if these are needed. + if parts[2] in ('C', 'O'): + # Build a formatted atom label: first letter of atom type + atom ID (from field 3) + col1 = parts[2][0] + parts[3] + # Pad the label to length 7. + col1 = col1.ljust(7) + # Convert coordinates with the conversion factor. + ''' + col2 = float(parts[4]) * conversion + col3 = float(parts[5]) * conversion + col4 = float(parts[6]) * conversion + temp_MMs.append(col1 + '%20.12f' % col2 + '%20.12f' % col3 + '%20.12f' % col4 + '\n') + ''' + x, y, z = [float(parts[i]) * conversion for i in range(4, 7)] + col2 = f"{x:.12f}".rjust(20) + col3 = f"{y:.12f}".rjust(20) + col4 = f"{z:.12f}".rjust(20) + temp_MMs.append(f"{col1}{col2}{col3}{col4}\n") + else: - if(line.split()[0]==mm_indexes[i][0]) and (line.split()[1]==mm_indexes[i][1]): - if(len(temp_MMs)>1): + # For non-"C"/"O" lines, check if the residue matches the expected mm_indexes information. + if parts[0] == mm_indexes[index][0] and parts[1] == mm_indexes[index][1]: + if len(temp_MMs) > 1: + # Append the last two entries from the temporary list, only the most recent "C" and "O" MMs.append(temp_MMs[-2]) MMs.append(temp_MMs[-1]) - temp_MMs=[] - col1=line.split()[2][0]+line.split()[3] - while(len(col1)<7): - col1+=' ' - col2=float(line.split()[4])*18.897161646321 - col3=float(line.split()[5])*18.897161646321 - col4=float(line.split()[6])*18.897161646321 - MMs.append(col1+'%20.12f'%col2+'%20.12f'%col3+'%20.12f'%col4+'\n') - prev_MM=1 + temp_MMs = [] + # Append current atom + col1 = parts[2][0] + parts[3] + col1 = col1.ljust(7) + x, y, z = [float(parts[i]) * conversion for i in range(4, 7)] + col2 = f"{x:.12f}".rjust(20) + col3 = f"{y:.12f}".rjust(20) + col4 = f"{z:.12f}".rjust(20) + MMs.append(f"{col1}{col2}{col3}{col4}\n") + prev_MM_flag = 1 else: - prev_MM=0 - prev_resid=line.split()[0] + prev_MM_flag = 0 + prev_resid = parts[0] return MMs -def get_MM_charges(mm_indexes,topol): - MMs=[] - temp_MMs=[] - prev_MM=0 - i=0 - prev_resid=0 - for line in topol: +def get_MM_charges(mm_indexes, topol_lines): + """ + Extract MM atom charges from the topology file. + Charges for water ("SOL") and ions are assigned from the water_and_ions dictionary. + For each residue (from mm_indexes) the corresponding charge information is appended. + + Parameters: + mm_indexes (list of [resid, resname]): Residues to process. + topol_lines (list of str): Lines from the topology file. + + Returns: + MMs (list of str): Formatted charge lines. + """ + MMs = [] + temp_MMs = [] + prev_MM_flag = 0 + index = 0 + prev_resid = None + + for line in topol_lines: + # When reaching the [ bonds ] section, finish processing by adding charges for remaining mm_indexes. if '[ bonds ]' in line: - atomid=int(MMs[-1].split()[0][1:-1]+MMs[-1].split()[0][-1]) - while(i1): + # Check if the current residue matches mm_indexes. + if parts[2] == mm_indexes[index][0] and parts[3] == mm_indexes[index][1]: + if len(temp_MMs) > 1: MMs.append(temp_MMs[-2]) MMs.append(temp_MMs[-1]) - temp_MMs=[] - col1=(line.split()[4][0]+line.split()[0]) - while(len(col1)<7): - col1+=' ' - col2=float(line.split()[6]) - MMs.append(col1+'%21.10f\n' % col2) - prev_MM=1 + temp_MMs = [] + col1 = parts[4][0] + parts[0] + col1 = col1.ljust(7) + col2 = float(parts[6]) + MMs.append(col1 + '%21.10f\n' % col2) + prev_MM_flag = 1 else: - prev_MM=0 - prev_resid=line.split()[2] + prev_MM_flag = 0 + prev_resid = parts[2] return MMs - + + +def charges_from_spec_topol(topol_lines, last_atom): + """ + Generate charge lines from a specialized topology file.(in the case that several .itp files are used) + Each line is formatted with an atom name, the charge (from the topology), and a fixed zero multipole. + + Parameters: + topol_lines (list of str): Lines from a specialized topology file. + last_atom (int): The atom counter from which to continue numbering. + + Returns: + outlines (list of str): Formatted charge lines. + """ + i = last_atom + col3 = ' 0.0000000000' + outlines = [] + for line in topol_lines: + # Bonds section beginning means atom charges section is done + if '[ bonds ]' in line: + return outlines + # Atom lines have first character as space, ignore any other lines + if line[0] != ' ': + continue + # Atom ID in topology is not "correct." Instead of using this, take last_atom to be the + # 'continuation' point. + else: + i += 1 + col2 = f"{float(line.split()[6]):14.10f}" + atomname = line.split()[4][0] + str(i) + col1 = f"{atomname}".ljust(14) + outlines.append(f"{col1}{col2}{col3}\n") + return outlines + + def get_screen(charges): - screens=[] + """ + Generate screening lines for a list of charge lines. + For each atom in charges, a screening line is produced with fixed screening parameters. + Screening terms are not "read" or generated. They are all the same. + + Parameters: + charges (list of str): List of formatted charge lines. + + Returns: + screens (list of str): Formatted screening lines. + """ + screens = [] + #every atom that has charges listed also needs screen paramters. for atom in charges: - col1=atom.split()[0] - while(len(col1)<7): - col1+=' ' - col2=' 1.0000000000 10.0000000000\n' - screens.append(col1+col2) + col1 = atom.split()[0] + col1 = col1.ljust(7) + # Screening parameters are fixed (e.g., vdW scaling and cutoff) + col2 = ' 1.0000000000 10.0000000000\n' + screens.append(col1 + col2) return screens - -def main(efp_g96, full_g96, topol): - with open(efp_g96,'r') as inp: - shell_lines=inp.readlines() - with open(full_g96,'r') as efp: - full_lines=efp.readlines() - with open(topol,'r') as top: - topol_lines=top.readlines() - - efp_residues=get_EFPs(shell_lines) - all_residues=get_EFPs(full_lines) - mm_residues=[] + + +def main(efp_g96, full_g96, topol_file): + """ + Main routine: + - Reads the EFP region structure file, full structure file, and topology file. + - Extracts residue information and determines which residues belong to MM. + - Extracts MM coordinates, charges, and screening parameters. + - Writes the results to "prot.efp". + """ + with open(efp_g96, 'r') as inp: + shell_lines = inp.readlines() + with open(full_g96, 'r') as efp: + full_lines = efp.readlines() + with open(topol_file, 'r') as top: + topol_lines = top.readlines() + + # Get residue information from the EFP and full structure files. + efp_residues = get_EFPs(shell_lines) + all_residues = get_EFPs(full_lines) + + # Separate residues into those for MM processing and those considered as cofactors. + mm_residues = [] + separate_topol = [] for res in all_residues: - if(res[1]=='XXX'): + # Skip residues with name "XXX"; these are link atoms + if res[1] == 'XXX': continue + # Skip residues that are in the EFP region; not needed for classical region elif res in efp_residues: continue - else: + # If residue name is not in known cofactors, add to MM residues. + # known_cofactors are residues that have separate topology and will not be found + # in the standard topology file. This script expeects to find an .itp file + # for every known_cofactor that will be used instead of the master topology. + elif res[1] not in known_cofactors: mm_residues.append(res) - MM_coords=get_MM_coords(mm_residues,full_lines) - MM_charge=get_MM_charges(mm_residues,topol_lines) - MM_screen2=get_screen(MM_charge) - with open('test_mm.efp','w') as outfile: + else: + separate_topol.append(res) + + # Extract MM coordinates and charges. + MM_coords = get_MM_coords(mm_residues, full_lines) + MM_charge = get_MM_charges(mm_residues, topol_lines) + + # For residues in separate_topol, obtain additional charges from their specific topology. + for residue in separate_topol: + # Use the last atom from MM_charge to set the numbering + last_atom_str = MM_charge[-1].split()[0] + # Remove extra spaces and extract the numeric part (assuming format like "X") + last_ID = int(last_atom_str.strip()[1:]) + # This example expects .itp contained in a folder named "amber03.ff" + # Change this as needed! + topol_filename = 'amber03.ff/' + residue[1] + '.itp' + with open(topol_filename, 'r') as toplines_file: + spec_topol_lines = toplines_file.readlines() + temp_charges = charges_from_spec_topol(spec_topol_lines, last_ID) + for atom_charge in temp_charges: + MM_charge.append(atom_charge) + + MM_screen2 = get_screen(MM_charge) + + # Write the output file with coordinates, charges, and screening information. + # Headings and sections are written explicitly here. + with open('prot.efp', 'w') as outfile: outfile.write(' $PROT\n') outfile.write('TITLE\n') outfile.write(' COORDINATES (BOHR)\n') @@ -178,7 +345,9 @@ def main(efp_g96, full_g96, topol): outfile.write('STOP\n') outfile.write(' $END') + if __name__ == "__main__": + # Example usage: python script.py efp_pair53004.g96 confout_pair53004.g96 edit_topol.itp main(sys.argv[1], sys.argv[2], sys.argv[3]) - #main('efp_pair53004.g96','confout_pair53004.g96','edit_topol.itp') - # efp_structure, full_strucuter, topol.top \ No newline at end of file + # For testing: + # main('efp_pair53004.g96', 'confout_pair53004.g96', 'edit_topol.itp')