Formulaires/Rules.py

#!/usr/bin/env python3

# MIT License
#
# Copyright (c) 2016 Nathanaël Restori
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import argparse
import re

# Command line: one or more input files; argparse opens each one for reading.
parser = argparse.ArgumentParser()
parser.add_argument(
    'infile',
    type=argparse.FileType('r'),
    nargs='+',
)
args = parser.parse_args()

#TODO: list forbidden characters (|)
# \\\!|\\\;|\\\:|\\\,
# hphantom ?, hspace ?
# e^
# cte, text{, }, t.q.

# Rules run on the whole file before it is split into math and text parts.
# Rule keys (see apply_rules): '+<' / '+>' insert a string before / after the
# symbol, '-<' / '->' remove a pattern before / after the symbol.
rules_bef = [
    # A space before every $ that is not at the beginning of a line ...
    {
        'symbol': r'(?<!^)\$',
        '+<': ' ',
    },
    # ... and a space after every $ that is not at the end of a line.
    {
        'symbol': r'\$(?!$)',
        '+>': ' ',
    },
    # But no space between an opening ( or { and the $ ...
    {
        'symbol': r'(?<=\( |{ )\$',
        '-<': r' ',
    },
    # ... nor between the $ and a closing ) or }.
    {
        'symbol': r'\$(?= \)| })',
        '->': r' ',
    },
    # No space after \text
    {
        'symbol': r'\\text',
        '->': r' ',
    },
]

# Rules run on math segments ($...$), outside of \text{...}.
# The order of the entries is the order in which they are applied.
rules_math = (
    # Ensure a space before and after.
    [
        {'symbol': sym, '+<': ' ', '+>': ' '}
        for sym in (r'=', r'\\cdot', r'\\quad')
    ]
    # Arrows get "\quad " before and " \quad" after.
    + [
        {'symbol': sym, '+<': '\\quad ', '+>': ' \\quad'}
        for sym in (
            r'\\leftrightarrow',
            r'\\Leftrightarrow',
            r'\\Leftarrow',
            r'\\Rightarrow',
        )
    ]
    # Standard functions: backslash before, space after. The trigonometric
    # ones also warn when not followed by (an optional exponent and) \left.
    + [
        {
            'symbol': sym,
            '+<': '\\',
            '+>': ' ',
            'w!>': r'h? ?(?:\^(?:{.*}|.))? \\left',
        }
        for sym in ('(arc)?sinh?', '(arc)?cosh?', '(arc)?tanh?')
    ]
    + [
        {'symbol': sym, '+<': '\\', '+>': ' '}
        for sym in ('(?<!{)min(?!})', '(?<!{)max(?!})')
    ]
    # Logarithms warn when not followed by (an optional subscript and)
    # \left, or by \abs.
    + [
        {
            'symbol': sym,
            '+<': '\\',
            '+>': ' ',
            'w!>': r' ?(?:_(?:{.*}|.))? \\left| \\abs',
        }
        for sym in ('ln', 'log')
    ]
    + [
        {'symbol': 'lim(?!its)', '+<': '\\', '+>': ' '},
    ]
    # Delimiters: space after, and a warning when the five characters before
    # the delimiter are neither "\left" nor "right".
    + [
        {'symbol': sym, '+>': ' ', 'w!<': r'\\left|right'}
        for sym in (
            r'\(',
            r'\[',
            r'\\{',
            r'\\langle',
            r'\)',
            r'\]',
            r'\\}',
            r'\\rangle',
            r'\\\|',
            r'(?<!\\)\|',
        )
    ]
    # Space before \left or \right but not after.
    + [
        {'symbol': sym, '+<': ' ', '->': r' '}
        for sym in (r'\\left', r'\\right')
    ]
    # No space before ^, _ and !
    + [
        {'symbol': sym, '-<': r' '}
        for sym in (r'\^', r'_', r'!')
    ]
    # No space after { and none before } (escaped \{ and \} are left alone).
    + [
        {'symbol': r'(?<!\\){', '->': r' '},
        {'symbol': r'(?<!\\)}', '-<': r' '},
    ]
)

# Rules run on the non-math segments of the file.
rules_text = [
    # Put a non-breaking space (~) before every ':' (add a normal space too ?)
    {'symbol': r':', '+<': '~'},
    # A ~ must not be surrounded by ordinary spaces.
    {'symbol': r'~', '-<': r' ', '->': r' '},
]

# Rules run on the whole file once the math and text parts are joined again.
rules_end = [
    # Punctuation: no space before ',' and one after it.
    {'symbol': r',', '-<': r' ', '+>': ' '},
    # No space before ';'. None is added after it either, because that would
    # cause problems in intervals such as [a;b].
    {'symbol': r';', '-<': r' '},
    # Strip trailing spaces and tabs at the end of each line.
    {'symbol': r'$', '-<': r'[ \t]*'},
]

# {} after ^ and _ ?
# \text{, } vs something else ?
# Ensure no cdot after partial frac ( frac{\partial U}{\partial \phi} \cdot)

def _warning_source(source_name):
    """Return the file name to show in a warning message."""
    if source_name is not None:
        return source_name
    # Backward compatibility: the script's main loop exposes the file being
    # processed through the module-level ``file_current`` variable.
    return getattr(globals().get('file_current'), 'name', '<unknown>')


def _print_match_context(text, match):
    """Print *text*, then several windows of *text* around *match*."""
    start, end = match.start(), match.end()
    print(text)
    for margin in (250, 0, 1, 10):
        # Clamp the lower bound at 0: a negative slice start would wrap around
        # to the end of the string and print an unrelated (or empty) window
        # whenever the match sits close to the beginning of the text.
        print(text[max(0, start - margin):end + margin])


def apply_rules(text, rules, source_name=None):
    """Apply every rule of *rules* to *text*, in order, and return the result.

    A rule is a dict holding a ``'symbol'`` regular expression plus any of
    the following optional keys:

    * ``'+<'``: literal string that must precede the symbol; it is inserted
      when it is not already there.
    * ``'+>'``: literal string that must follow the symbol; it is inserted
      when it is not already there.
    * ``'-<'``: regular expression removed when found right before the symbol.
    * ``'->'``: regular expression removed when found right after the symbol.
    * ``'w!<'``: regular expression; a warning is printed when the symbol
      occurs without it right before. It is used inside a lookbehind, so it
      must match strings of a fixed width.
    * ``'w!>'``: regular expression; a warning is printed, together with some
      context, when the symbol occurs without it right after.

    The warning rules never modify the text. All patterns are compiled with
    the MULTILINE, DOTALL and UNICODE flags.

    Args:
        text: the string to process.
        rules: iterable of rule dicts as described above.
        source_name: file name shown in warnings. When omitted, the name of
            the module-level ``file_current`` object is used if it exists.

    Returns:
        The processed string.
    """
    flags = re.MULTILINE | re.DOTALL | re.UNICODE

    for rule in rules:
        symbol = rule.get('symbol')

        prefix = rule.get('+<')
        if prefix:
            # An existing prefix is consumed by the optional group and written
            # back by the replacement, so it is never duplicated.
            regex = r'(?:' + re.escape(prefix) + r')?(' + symbol + r')'
            # Backslashes are doubled so the replacement template emits them
            # literally.
            subst = prefix.replace('\\', '\\\\') + r'\1'
            text = re.sub(regex, subst, text, flags=flags)

        suffix = rule.get('+>')
        if suffix:
            regex = r'(' + symbol + r')(?:' + re.escape(suffix) + r')?'
            subst = r'\1' + suffix.replace('\\', '\\\\')
            text = re.sub(regex, subst, text, flags=flags)

        drop_before = rule.get('-<')
        if drop_before:
            regex = r'(?:' + drop_before + r')(' + symbol + r')'
            text = re.sub(regex, r'\1', text, flags=flags)

        drop_after = rule.get('->')
        if drop_after:
            regex = r'(' + symbol + r')(?:' + drop_after + r')'
            text = re.sub(regex, r'\1', text, flags=flags)

        expected_before = rule.get('w!<')
        if expected_before:
            regex = r'(?<!' + expected_before + r')(' + symbol + r')'
            # Only the first offending occurrence is reported.
            if re.search(regex, text, flags=flags):
                print("In file " + _warning_source(source_name) + ": missing " + expected_before + " before " + symbol + " (regex: " + regex + ")")

        expected_after = rule.get('w!>')
        if expected_after:
            regex = r'(' + symbol + r')(?!' + expected_after + r')'
            # Only the first offending occurrence is reported.
            result = re.search(regex, text, flags=flags)
            if result:
                print("In file " + _warning_source(source_name) + ": missing " + expected_after + " after " + symbol + " (regex: " + regex + ")")
                _print_match_context(text, result)

    return text

# Process every file given on the command line and rewrite it in place when
# the rules changed something.
for file_current in args.infile:
    # argparse already opened the file. Read it and close the handle right
    # away: the same path is reopened for writing at the end of the loop.
    with file_current:
        file_content = file_current.read()
    file_original = file_content

    #TODO: add other cases (\$ for example)
    ## Check for $ in comments (we will have troubles if a comment contain an odd number of $)
    #if re.search(r'%.*\$', file_content, flags=re.MULTILINE | re.UNICODE):
        #print("Warning, file " + file_current.name + " contain $ in comments, ignoring file")
        #continue

    file_content = apply_rules(file_content, rules_bef)

    # Split the content on $...$. Because the pattern has a capturing group,
    # the math parts are kept: they sit at the odd indices, the normal text
    # at the even ones.
    segments = re.split(r'(\$.*?\$)', file_content, flags=re.MULTILINE | re.DOTALL | re.UNICODE)

    for i in range(1, len(segments), 2):
        # Split each math part on \text{...}: the \text{...} pieces (odd
        # indices) are left untouched, the math rules only run on the rest.
        math_pieces = re.split(r'(\\text{.*?})', segments[i], flags=re.MULTILINE | re.DOTALL | re.UNICODE)
        for j in range(0, len(math_pieces), 2):
            math_pieces[j] = apply_rules(math_pieces[j], rules_math)
        segments[i] = ''.join(math_pieces)

    for i in range(0, len(segments), 2):
        segments[i] = apply_rules(segments[i], rules_text)

    file_content = ''.join(segments)

    file_content = apply_rules(file_content, rules_end)

    # Replace explicit delimiter pairs by the matching commands.
    file_content = re.sub(r'\\left\\\| (.*?) \\right\\\|', r'\\norm{\1}', file_content, flags=re.MULTILINE)
    file_content = re.sub(r'\\left\| (.*?) \\right\|', r'\\abs{\1}', file_content, flags=re.MULTILINE)
    # No closing brace here: \langle ... \rangle does not open one.
    file_content = re.sub(r'\\left< (.*?) \\right>', r'\\left\\langle \1 \\right\\rangle', file_content, flags=re.MULTILINE)

    file_content = re.sub(r'\.\.\.', r'\\dots', file_content, flags=re.MULTILINE)

    # Ensure no newline at the end of a cell
    file_content = re.sub(r' \\newline\n&', r'\n&', file_content, flags=re.MULTILINE)
    file_content = re.sub(r' \\\\\n&',      r'\n&', file_content, flags=re.MULTILINE)
    file_content = re.sub(r' \\newline\n\\\\', r'\n\\\\', file_content, flags=re.MULTILINE)
    file_content = re.sub(r' \\\\\n\\\\',      r'\n\\\\', file_content, flags=re.MULTILINE)
    file_content = re.sub(r' \\\\\n( *)\\end\{tabu\}',      r'\n\1\\end{tabu}', file_content, flags=re.MULTILINE)

    # Remove spaces between \\ and \hline
    file_content = re.sub(r'\\\\ +\\hline',      r'\\\\\\hline', file_content, flags=re.MULTILINE)

    # Second pass of the final rules, after the substitutions above.
    file_content = apply_rules(file_content, rules_end)

    # Save only if needed
    if file_original == file_content:
        print("File untouched: " + file_current.name)
    else:
        print("File modified: " + file_current.name)
        # Mode "w" already truncates the file, no seek/truncate needed.
        with open(file_current.name, "w") as f:
            f.write(file_content)