This repository has been archived on 2016-08-24. You can view files and clone it, but cannot push or open issues or pull requests.
proofTeX/detex.py

298 lines
7.0 KiB
Python
Executable File

#!/usr/bin/env python3
# proofTeX - Tools for proofing LaTeX documents - detex.py
# Copyright © 2016 RunasSudo (Yingtong Li)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import copy
import ply.lex
import re
# Token type names declared for PLY. The <NAME> part of every
# t_<state>_<NAME> rule function in this file is one of these names.
tokens = (
'BEGIN_DOCUMENT', 'END_DOCUMENT',
'DOLLAR', 'ESCDOLLAR',
'DDOLLAR',
'BEGIN_ALIGN', 'END_ALIGN', 'INTERTEXT',
'BEGIN_IGNORED_ENV', 'END_IGNORED_ENV',
'BEGIN_ENVIRONMENT', 'END_ENVIRONMENT',
'BEGIN_GROUP', 'END_GROUP', 'ESCBRACE',
'PERCENT', 'ESCPERCENT',
'CHAR', 'NEWLINE',
'SPECIAL_MACRO', 'MACRO',
)
# Extra lexer states, in addition to PLY's built-in INITIAL state. Every
# state is declared 'exclusive'; rule functions select their state through
# the t_<state>_ prefix (t_ANY_ rules are registered for all states).
states = (
('document', 'exclusive'),
('inline', 'exclusive'),
('display', 'exclusive'),
('align', 'exclusive'),
('ignoredenv', 'exclusive'),
('group', 'exclusive'),
('intertext', 'exclusive'),
('specialmacro', 'exclusive'),
('comment', 'exclusive'),
)
def _super(text, t, superof=None):
    """Lex ``text`` with a fresh lexer placed in an enclosing state of ``t``.

    A clone of ``baselexer`` is created and the entries of the running
    lexer's state stack are pushed onto it in order. Entries equal to the
    running lexer's current state are skipped, and the walk stops at the
    first entry equal to ``superof`` (when one is given).

    Returns the concatenated values of all tokens the clone yields for
    ``text``.
    """
    sublexer = clone_lexer(baselexer)
    active_state = t.lexer.lexstate
    for state in t.lexer.lexstatestack:
        if superof is not None and state == superof:
            break
        if state == active_state:
            continue
        sublexer.push_state(state)
    sublexer.input(text)
    return ''.join(tok.value for tok in sublexer)
def _value(t, text, grouptext=None):
if t.lexer.lexstate == 'group':
t.lexer.stack[-1] += (grouptext if grouptext is not None else text)
return None
else:
t.value = text
return t
# Rule for an escaped percent sign, registered for every lexer state (the
# docstring is the rule's regex). The token value is whatever _super()
# produces when a bare '%' is re-lexed in the enclosing state.
# NOTE(review): a bare '%' is exactly what t_ANY_PERCENT matches, so the
# re-lexed result looks like it is always empty -- confirm that dropping the
# escaped percent sign is intended.
def t_ANY_ESCPERCENT(t):
    r'\\%'
    relexed = _super('%', t)
    t.value = relexed
    return t
# Rule for an escaped brace, registered for every lexer state (the docstring
# is the rule's regex). The leading backslash is removed and the bare brace
# is returned as the token value.
def t_ANY_ESCBRACE(t):
    r'\\[{}]'
    brace = t.value[1:]
    t.value = brace
    return t
# Rule for an escaped dollar sign, registered for every lexer state (the
# docstring is the rule's regex). The token is returned with a literal '$'
# as its value.
def t_ANY_ESCDOLLAR(t):
    r'\\\$'
    literal_dollar = '$'
    t.value = literal_dollar
    return t
# Rule for an unescaped '%', registered for every lexer state (the docstring
# is the rule's regex). It starts a comment by pushing the 'comment' state
# and emits no token.
def t_ANY_PERCENT(t):
    r'%'
    # This rule is also active inside the 'comment' state itself. Pushing
    # 'comment' again there would require two newlines to leave it, so the
    # whole following line would be treated as a comment as well.
    if t.lexer.lexstate != 'comment':
        t.lexer.push_state('comment')
    return None
# Rule for any non-newline character inside a comment: matched and
# discarded (no token is returned). The docstring is the rule's regex.
def t_comment_CHAR(t):
    r'.'
    return None
# Rule for the newline that terminates a comment: leaves the 'comment'
# state by popping the lexer's state stack. No token is returned, so the
# newline itself is not part of the output.
def t_comment_NEWLINE(t):
    r'\n'
    lexer = t.lexer
    lexer.pop_state()
    return None
# Rule for '$$' in the document state: opens display maths. A fresh empty
# buffer is pushed onto lexer.stack to collect the maths text and the lexer
# enters the 'display' state. No token is returned.
def t_document_DDOLLAR(t):
    r'\$\$'
    lexer = t.lexer
    lexer.stack.append('')
    lexer.push_state('display')
    return None
# Rule for a single '$' in the document state: opens inline maths. A fresh
# empty buffer is pushed onto lexer.stack to collect the maths text and the
# lexer enters the 'inline' state. No token is returned.
def t_document_DOLLAR(t):
    r'\$'
    lexer = t.lexer
    lexer.stack.append('')
    lexer.push_state('inline')
    return None
# Rule for the '$$' that closes display maths (also called directly by
# t_inline_DOLLAR). The collected maths text is popped from lexer.stack and
# the maths state is left. Maths containing any character outside the
# whitelist below is dropped (None is returned); otherwise a token is
# returned whose value is the placeholder 'MATHS' when --count is set, or
# the maths text itself.
def t_display_DDOLLAR(t):
    r'\$\$'
    maths = t.lexer.stack.pop()
    t.lexer.pop_state()
    if re.search(r'[^a-zA-Z0-9_^{}αβγδεζηθικλμνξπρστυφχψω ]', maths):
        return None
    t.value = 'MATHS' if args.count else maths
    return t
# Rule for the '$' that closes inline maths. Closing is handled exactly as
# for display maths, so the work is delegated to t_display_DDOLLAR.
def t_inline_DOLLAR(t):
    r'\$'
    outcome = t_display_DDOLLAR(t)
    return outcome
# Rule for any non-newline character inside maths: the character is added to
# the buffer on top of lexer.stack; no token is returned.
def t_inline_CHAR(t):
    r'.'
    buffers = t.lexer.stack
    buffers[-1] = buffers[-1] + t.value
    return None
# Rule for a newline inside maths: matched and discarded, so it is not added
# to the collected maths text.
def t_inline_NEWLINE(t):
    r'\n'
    return None
# The 'display' state reuses the inline-maths character and newline rules
# (same function objects, hence same regexes and behaviour).
t_display_CHAR = t_inline_CHAR
t_display_NEWLINE = t_inline_NEWLINE
# Rule for \begin{document} in the INITIAL state: enters the 'document'
# state. No token is returned.
def t_INITIAL_BEGIN_DOCUMENT(t):
    r'\\begin\s*{\s*document\s*}'
    lexer = t.lexer
    lexer.push_state('document')
    return None
# Rule for \end{document} in the 'document' state: leaves that state by
# popping the lexer's state stack. No token is returned.
def t_document_END_DOCUMENT(t):
    r'\\end\s*{\s*document\s*}'
    lexer = t.lexer
    lexer.pop_state()
    return None
# Rule for \begin{align} / \begin{align*} in the 'document' state: enters
# the 'align' state. No token is returned.
def t_document_BEGIN_ALIGN(t):
    r'\\begin\s*{\s*align\*?\s*}'
    lexer = t.lexer
    lexer.push_state('align')
    return None
# Rule for \end{align} / \end{align*} in the 'align' state: leaves that
# state by popping the lexer's state stack. No token is returned.
def t_align_END_ALIGN(t):
    r'\\end\s*{\s*align\*?\s*}'
    lexer = t.lexer
    lexer.pop_state()
    return None
# Rule for '\intertext{' inside an align environment: pushes a fresh empty
# buffer onto lexer.stack to collect the text and enters the 'intertext'
# state. No token is returned.
def t_align_INTERTEXT(t):
    r'\\intertext\s*{'
    lexer = t.lexer
    lexer.stack.append('')
    lexer.push_state('intertext')
    return None
# Rule for the '}' that closes \intertext{...}. The collected text is popped
# from lexer.stack and re-lexed by _super() using the states that enclose
# the 'align' state; the result becomes the value of the returned token.
# The 'intertext' state is left afterwards.
def t_intertext_END_GROUP(t):
    r'}'
    collected = t.lexer.stack.pop()
    t.value = _super(collected, t, 'align')
    t.lexer.pop_state()
    return t
# Inside \intertext{...} a '$' opens inline maths exactly as in the
# 'document' state (same function object).
t_intertext_DOLLAR = t_document_DOLLAR
# Rule for any non-newline character inside \intertext{...}: the character
# is added to the buffer on top of lexer.stack; no token is returned.
def t_intertext_CHAR(t):
    r'.'
    buffers = t.lexer.stack
    buffers[-1] = buffers[-1] + t.value
    return None
# Rule for a line break inside \intertext{...}: matched and discarded, so
# the newline is not added to the collected text.
# The pattern has to be '\n' explicitly: '.' does not match a newline, and
# no other rule active in the 'intertext' state matches one either, so
# without this rule a line break here would reach the lexer's error handler.
# NOTE(review): discarding the newline joins the words on either side of the
# line break; confirm whether it should instead be appended to the buffer as
# the 'group' and 'specialmacro' newline rules do.
def t_intertext_NEWLINE(t):
    r'\n'
    return None
# Rule for \begin{figure}, \begin{table} or \begin{nocount} (optionally
# starred), registered for every lexer state: enters the 'ignoredenv' state.
# No token is returned.
def t_ANY_BEGIN_IGNORED_ENV(t):
    r'\\begin\s*{(figure|table|nocount)\*?}'
    lexer = t.lexer
    lexer.push_state('ignoredenv')
    return None
# Rule for \end{figure}, \end{table} or \end{nocount} (optionally starred)
# inside an ignored environment: leaves the 'ignoredenv' state by popping
# the lexer's state stack. No token is returned.
def t_ignoredenv_END_IGNORED_ENV(t):
    r'\\end\s*{(figure|table|nocount)\*?}'
    lexer = t.lexer
    lexer.pop_state()
    return None
# Rule for any non-newline character inside an ignored environment: matched
# and discarded (no token is returned).
def t_ignoredenv_CHAR(t):
    r'.'
    return None
# Rule for a newline inside an ignored environment: matched and discarded
# (no token is returned).
def t_ignoredenv_NEWLINE(t):
    r'\n'
    return None
# Rule for a \begin of any other environment in the 'document' state,
# together with the brace groups that directly follow it: matched and
# discarded. No state change, no token.
def t_document_BEGIN_ENVIRONMENT(t):
    r'\\begin\s*({.+?})+'
    return None
# Rule for an \end{...} of any other environment in the 'document' state:
# matched and discarded. No state change, no token.
def t_document_END_ENVIRONMENT(t):
    r'\\end\s*{.+?}'
    return None
# Rule for the opening of a specially handled macro (\autoref{, \ref{,
# \autocite{, \textcite{, \label{ or \footnote{), registered for every lexer
# state. Two entries are pushed onto lexer.stack: the macro name (extracted
# by re-matching this rule's own regex docstring against the matched text)
# and an empty buffer for the argument. The lexer then enters the
# 'specialmacro' state. No token is returned.
def t_ANY_SPECIAL_MACRO(t):
    r'\\(autoref|ref|autocite|textcite|label|footnote)\s*{'
    macro_name = re.match(t_ANY_SPECIAL_MACRO.__doc__, t.value).group(1)
    lexer = t.lexer
    lexer.stack.extend([macro_name, ''])
    lexer.push_state('specialmacro')
    return None
# Rule for the '}' that closes a special macro's argument. The argument
# buffer is popped and re-lexed through _super(), the macro name is popped,
# and the 'specialmacro' state is left.
#   * With --tts: \label produces nothing; every other macro yields a token
#     whose value is the re-lexed argument text.
#   * Otherwise a placeholder of the guessed length is delivered through
#     _value(): 'REFTYPE NUMBER' for \autoref, 'NUMBER' for \ref and 'AUTHOR'
#     for \textcite. \autocite, \label and \footnote produce nothing.
def t_specialmacro_END_GROUP(t):
    r'}'
    grouptext = _super(t.lexer.stack.pop(), t)
    macro = t.lexer.stack.pop()
    t.lexer.pop_state()

    if args.tts:
        if macro == 'label':
            return None
        t.value = grouptext
        return t

    # guess word counts
    placeholders = {
        'autoref': 'REFTYPE NUMBER',
        'ref': 'NUMBER',
        'textcite': 'AUTHOR',
    }
    placeholder = placeholders.get(macro)
    if placeholder is None:
        return None
    return _value(t, placeholder)
# Rule for any non-newline character inside a special macro's argument: the
# character is added to the buffer on top of lexer.stack; no token is
# returned.
def t_specialmacro_CHAR(t):
    r'.'
    buffers = t.lexer.stack
    buffers[-1] = buffers[-1] + t.value
    return None
# Rule for a newline inside a special macro's argument: the newline is kept
# by adding it to the buffer on top of lexer.stack; no token is returned.
def t_specialmacro_NEWLINE(t):
    r'\n'
    buffers = t.lexer.stack
    buffers[-1] = buffers[-1] + t.value
    return None
# Rule for '{' inside inline maths: matched and discarded, without opening a
# 'group' state.
# NOTE(review): t_inline_CHAR ('.') is defined earlier in this file and also
# matches '{'; PLY tries function rules in definition order, so this rule
# may never fire -- confirm.
def t_inline_BEGIN_GROUP(t):
    r'{'
    return None
# Rule for '}' inside inline maths: matched and discarded.
# NOTE(review): t_inline_CHAR ('.') is defined earlier in this file and also
# matches '}'; PLY tries function rules in definition order, so this rule
# may never fire -- confirm.
def t_inline_END_GROUP(t):
    r'}'
    return None
# The 'display' state reuses the inline-maths brace rules (same function
# objects, hence same regexes and behaviour).
t_display_BEGIN_GROUP = t_inline_BEGIN_GROUP
t_display_END_GROUP = t_inline_END_GROUP
# Rule for '{', registered for every lexer state: opens a brace group. A
# fresh empty buffer is pushed onto lexer.stack to collect the group's text
# and the lexer enters the 'group' state. No token is returned.
def t_ANY_BEGIN_GROUP(t):
    r'{'
    lexer = t.lexer
    lexer.stack.append('')
    lexer.push_state('group')
    return None
# Rule for the '}' that closes a brace group. The group's buffer is popped
# and re-lexed through _super(), and the 'group' state is left. The result
# is handed to _value(): if the lexer is then still inside an outer group,
# the text is appended to that group's buffer wrapped in braces; otherwise
# it becomes the value of the returned token.
def t_group_END_GROUP(t):
    r'}'
    raw = t.lexer.stack.pop()
    grouptext = _super(raw, t)
    t.lexer.pop_state()
    braced = ''.join(('{', grouptext, '}'))
    return _value(t, grouptext, braced)
# Rule for any non-newline character inside a brace group: the character is
# added to the buffer on top of lexer.stack; no token is returned.
def t_group_CHAR(t):
    r'.'
    buffers = t.lexer.stack
    buffers[-1] = buffers[-1] + t.value
    return None
# Rule for a newline inside a brace group: the newline is kept by adding it
# to the buffer on top of lexer.stack; no token is returned.
def t_group_NEWLINE(t):
    r'\n'
    buffers = t.lexer.stack
    buffers[-1] = buffers[-1] + t.value
    return None
# Catch-all rule for any other macro (backslash + letters, optional '*' and
# optional [...] argument), registered for every lexer state: the macro is
# matched and ignored. No token is returned.
def t_ANY_MACRO(t):
    r'\\[a-zA-Z]+\*?(\[.*?\])?'
    return None  # Ignore any other macros
# Rule for any non-newline character in the INITIAL state (before the
# document environment is entered): matched and discarded.
def t_INITIAL_CHAR(t):
    r'.'
    return None
# Rule for a newline in the INITIAL state (before the document environment
# is entered): matched and discarded.
def t_INITIAL_NEWLINE(t):
    r'\n'
    return None
# Rule for any non-newline character of ordinary document text: the token is
# returned unchanged, so the character becomes part of the output.
def t_document_CHAR(t):
    r'.'
    token = t
    return token
# Rule for a newline in ordinary document text: the token is returned
# unchanged, so the newline becomes part of the output.
def t_document_NEWLINE(t):
    r'\n'
    token = t
    return token
# Inside an align environment, plain characters and newlines are discarded
# by reusing the INITIAL-state rules (same function objects).
t_align_CHAR = t_INITIAL_CHAR
t_align_NEWLINE = t_INITIAL_NEWLINE
# Error handler registered for every lexer state: called when no rule
# matches at the current position. t.value holds the remaining input, so its
# first character is the one that could not be lexed.
# Raises Exception naming that character.
def t_ANY_error(t):
    # A '{}' placeholder is required here: str.format() does not substitute
    # '%s', which would leave the offending character out of the message.
    raise Exception('Illegal character \'{}\''.format(t.value[0]))
# -----
def clone_lexer(lex):
    """Return a deep copy of ``lex`` that shares its compiled rule tables.

    The ``lexre`` and ``lexstatere`` attributes are excluded from the deep
    copy: they are temporarily set to None on ``lex`` while it is copied and
    afterwards the very same objects are assigned to both ``lex`` and the
    copy. Everything else (e.g. the ``stack`` list) is copied, so the clone
    can be driven independently of ``lex``.

    NOTE(review): presumably these attributes are skipped because they hold
    compiled regular expressions that need not (or cannot) be deep-copied --
    confirm.
    """
    shared_attrs = ('lexre', 'lexstatere')
    saved = {name: getattr(lex, name) for name in shared_attrs}
    for name in shared_attrs:
        setattr(lex, name, None)
    try:
        newlex = copy.deepcopy(lex)
    finally:
        # Always restore the source lexer, even if the copy fails; otherwise
        # it would be left without its rule tables.
        for name, value in saved.items():
            setattr(lex, name, value)
    for name, value in saved.items():
        setattr(newlex, name, value)
    return newlex
# Command-line entry point: parse the arguments, build the lexer from the
# t_* rules of this module, strip the LaTeX markup from the input file and
# print either the remaining text or its word count.
# `args` and `baselexer` stay module-level names because the rule functions
# and _super() read them as globals.
parser = argparse.ArgumentParser(description='Strip LaTeX from a file and optionally count words or format for text-to-speech')
parser.add_argument('infile', help='The input LaTeX file')
parser.add_argument('--document', action='store_true', help='Treat entire input as content of document environment')
parser.add_argument('--count', action='store_true', help='Count words')
parser.add_argument('--tts', action='store_true', help='Format for text-to-speech')
args = parser.parse_args()

baselexer = ply.lex.lex()
# Bottom entry of the text-buffer stack; the rule functions push and pop
# further buffers on top of it.
baselexer.stack = ['']
lexer = clone_lexer(baselexer)

with open(args.infile, 'r', encoding='utf8') as f:
    data = f.read()

# Remove \iffalse ... \fi blocks before lexing. DOTALL lets a block span
# several lines, and the lookahead stops '\fi' from matching the beginning
# of a longer macro name such as \fill.
data = re.sub(r'\\iffalse.*?\\fi(?![a-zA-Z])', '', data, flags=re.DOTALL)

if args.document:
    lexer.push_state('document')
lexer.input(data)
string = ''.join(tok.value for tok in lexer)

# Tidy whitespace: none before punctuation, runs collapsed to their first
# character, none at either end.
string = re.sub(r'\s+([,.!?])', r'\1', string)
string = re.sub(r'(\s)\s+', r'\1', string)
string = string.strip()

if args.count:
    # Split on any whitespace so that words separated by a line break are
    # counted too, and so that empty output counts as zero words.
    print(len(string.split()))
else:
    print(string)