Skip to content

Commit 68657ab

Browse files
author
Tom Keefe
committed
allow for including headers/footers via command line
1 parent f929ae1 commit 68657ab

File tree

9 files changed

+171
-18
lines changed

9 files changed

+171
-18
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,12 @@ Converts the source document to HTML.
285285
* `ignore_empty_paragraphs`: by default, empty paragraphs are ignored.
286286
Set this option to `False` to preserve empty paragraphs in the output.
287287

288+
* `include_headers_and_footers`: by default, headers and footers are not included in the output.
289+
Set this option to `True` to include headers at the start of the output and footers at the end.
290+
291+
* `deduplicate_headers_and_footers`: by default, when headers and footers are included, every one of them is output, even if several have the same text.
292+
Set this option to `True` to only include unique headers and footers.
293+
288294
* `id_prefix`:
289295
a string to prepend to any generated IDs,
290296
such as those used by bookmarks, footnotes and endnotes.

mammoth/conversion.py

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ def convert_document_element_to_html(element,
1616
convert_image=None,
1717
id_prefix=None,
1818
output_format=None,
19-
ignore_empty_paragraphs=True):
19+
ignore_empty_paragraphs=True,
20+
include_headers_and_footers=False,
21+
deduplicate_headers_and_footers=False):
2022

2123
if style_map is None:
2224
style_map = []
@@ -42,6 +44,8 @@ def convert_document_element_to_html(element,
4244
convert_image=convert_image,
4345
id_prefix=id_prefix,
4446
ignore_empty_paragraphs=ignore_empty_paragraphs,
47+
include_headers_and_footers=include_headers_and_footers,
48+
deduplicate_headers_and_footers=deduplicate_headers_and_footers,
4549
note_references=[],
4650
comments=comments,
4751
)
@@ -62,11 +66,22 @@ def copy(self, **kwargs):
6266

6367

6468
class _DocumentConverter(documents.element_visitor(args=1)):
65-
def __init__(self, messages, style_map, convert_image, id_prefix, ignore_empty_paragraphs, note_references, comments):
69+
def __init__(self,
70+
messages,
71+
style_map,
72+
convert_image,
73+
id_prefix,
74+
ignore_empty_paragraphs,
75+
include_headers_and_footers,
76+
deduplicate_headers_and_footers,
77+
note_references,
78+
comments):
6679
self._messages = messages
6780
self._style_map = style_map
6881
self._id_prefix = id_prefix
6982
self._ignore_empty_paragraphs = ignore_empty_paragraphs
83+
self._include_headers_and_footers = include_headers_and_footers
84+
self._deduplicate_headers_and_footers = deduplicate_headers_and_footers
7085
self._note_references = note_references
7186
self._referenced_comments = []
7287
self._convert_image = convert_image
@@ -81,17 +96,27 @@ def visit_image(self, image, context):
8196

8297
def visit_document(self, document, context):
8398
nodes = self._visit_all(document.children, context)
99+
100+
headers = []
101+
footers = []
102+
103+
if self._include_headers_and_footers:
104+
headers = self.visit_headers(document.headers, context)
105+
footers = self.visit_footers(document.footers, context)
106+
84107
notes = [
85108
document.notes.resolve(reference)
86109
for reference in self._note_references
87110
]
88111
notes_list = html.element("ol", {}, self._visit_all(notes, context))
112+
89113
comments = html.element("dl", {}, [
90114
html_node
91115
for referenced_comment in self._referenced_comments
92116
for html_node in self.visit_comment(referenced_comment, context)
93117
])
94-
return nodes + [notes_list, comments]
118+
119+
return headers + nodes + [notes_list, comments] + footers
95120

96121

97122
def visit_paragraph(self, paragraph, context):
@@ -300,6 +325,49 @@ def visit_comment(self, referenced_comment, context):
300325
html.element("dd", {}, body),
301326
]
302327

328+
def visit_header(self, header, context):
    """Convert the children of a single header into HTML nodes."""
    header_children = header.children
    return self._visit_all(header_children, context)
330+
331+
def visit_headers(self, headers, context):
    """Convert every header to HTML nodes, optionally dropping repeats.

    Each header is converted with ``visit_header`` and the resulting nodes
    are concatenated in the order the headers were given. When
    ``self._deduplicate_headers_and_footers`` is true, a node is kept only
    if no earlier node produced the same ``to_text()`` value, so the first
    occurrence wins and the relative order is preserved.
    """
    all_headers = [
        html_node
        for header in headers
        for html_node in self.visit_header(header, context)
    ]

    if not self._deduplicate_headers_and_footers:
        return all_headers

    seen_texts = set()
    unique_headers = []
    for html_node in all_headers:
        # Compute the text once per node rather than once for the
        # membership test and again for the insertion.
        text = html_node.to_text()
        if text not in seen_texts:
            seen_texts.add(text)
            unique_headers.append(html_node)

    return unique_headers
349+
350+
def visit_footer(self, footer, context):
    """Convert the children of a single footer into HTML nodes."""
    footer_children = footer.children
    return self._visit_all(footer_children, context)
352+
353+
def visit_footers(self, footers, context):
    """Convert every footer to HTML nodes, optionally dropping repeats.

    Each footer is converted with ``visit_footer`` and the resulting nodes
    are concatenated in the order the footers were given. When
    ``self._deduplicate_headers_and_footers`` is true, a node is kept only
    if no earlier node produced the same ``to_text()`` value, so the first
    occurrence wins and the relative order is preserved.
    """
    all_footers = [
        html_node
        for footer in footers
        for html_node in self.visit_footer(footer, context)
    ]

    if not self._deduplicate_headers_and_footers:
        return all_footers

    seen_texts = set()
    unique_footers = []
    for html_node in all_footers:
        # Compute the text once per node rather than once for the
        # membership test and again for the insertion.
        text = html_node.to_text()
        if text not in seen_texts:
            seen_texts.add(text)
            unique_footers.append(html_node)

    return unique_footers
303371

304372
def _visit_all(self, elements, context):
305373
return [

mammoth/documents.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ class HasChildren(Element):
1414
class Document(HasChildren):
1515
notes = cobble.field()
1616
comments = cobble.field()
17+
headers = cobble.field()
18+
footers = cobble.field()
1719

1820
@cobble.data
1921
class Paragraph(HasChildren):
@@ -97,12 +99,16 @@ class Image(Element):
9799
open = cobble.field()
98100

99101

100-
def document(children, notes=None, comments=None):
102+
def document(children, notes=None, comments=None, headers=None, footers=None):
101103
if notes is None:
102104
notes = Notes({})
103105
if comments is None:
104106
comments = []
105-
return Document(children, notes, comments=comments)
107+
if headers is None:
108+
headers = []
109+
if footers is None:
110+
footers = []
111+
return Document(children, notes, comments=comments, headers=headers, footers=footers)
106112

107113
def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None):
108114
if indent is None:
@@ -252,5 +258,17 @@ class CommentReference(Element):
252258

253259
comment_reference = CommentReference
254260

261+
@cobble.data
class Header(HasChildren):
    """Document element that holds the children read from a header part."""
264+
265+
header = Header
266+
267+
@cobble.data
class Footer(HasChildren):
    """Document element that holds the children read from a footer part."""
270+
271+
footer = Footer
272+
255273
def element_visitor(args):
256274
return cobble.visitor(Element, args=args)

mammoth/docx/__init__.py

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from .. import results, lists, zips
77
from .document_xml import read_document_xml_element
8+
from .header_xml import (read_header_xml_element, read_footer_xml_element)
89
from .content_types_xml import empty_content_types, read_content_types_xml_element
910
from .relationships_xml import read_relationships_xml_element, Relationships
1011
from .numbering_xml import read_numbering_xml_element, Numbering
@@ -27,12 +28,14 @@ def read(fileobj):
2728
zip_file,
2829
part_paths=part_paths,
2930
)
30-
31+
3132
return results.combine([
3233
_read_notes(read_part_with_body, part_paths),
3334
_read_comments(read_part_with_body, part_paths),
35+
_read_headers(read_part_with_body, part_paths),
36+
_read_footers(read_part_with_body, part_paths)
3437
]).bind(lambda referents:
35-
_read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], part_paths=part_paths)
38+
_read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], headers=referents[2], footers=referents[3], part_paths=part_paths)
3639
)
3740

3841

@@ -43,6 +46,8 @@ class _PartPaths(object):
4346
endnotes = cobble.field()
4447
footnotes = cobble.field()
4548
numbering = cobble.field()
49+
headers = cobble.field()
50+
footers = cobble.field()
4651
styles = cobble.field()
4752

4853

@@ -55,21 +60,24 @@ def _find_part_paths(zip_file):
5560
_find_relationships_path_for(document_filename),
5661
)
5762

58-
def find(name):
63+
def find(name, multiple=False):
5964
return _find_part_path(
6065
zip_file=zip_file,
6166
relationships=document_relationships,
6267
relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + name,
6368
fallback_path="word/{0}.xml".format(name),
6469
base_path=zips.split_path(document_filename)[0],
70+
multiple=multiple
6571
)
66-
72+
6773
return _PartPaths(
6874
main_document=document_filename,
6975
comments=find("comments"),
7076
endnotes=find("endnotes"),
7177
footnotes=find("footnotes"),
7278
numbering=find("numbering"),
79+
headers=find("header", multiple=True),
80+
footers=find("footer", multiple=True),
7381
styles=find("styles"),
7482
)
7583

@@ -88,7 +96,7 @@ def _find_document_filename(zip_file, relationships):
8896
raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")
8997

9098

91-
def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
99+
def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path, multiple=False):
92100
targets = [
93101
zips.join_path(base_path, target).lstrip("/")
94102
for target in relationships.find_targets_by_type(relationship_type)
@@ -97,7 +105,7 @@ def _find_part_path(zip_file, relationships, relationship_type, base_path, fallb
97105
if len(valid_targets) == 0:
98106
return fallback_path
99107
else:
100-
return valid_targets[0]
108+
return valid_targets if multiple else valid_targets[0]
101109

102110

103111
def _read_notes(read_part_with_body, part_paths):
@@ -111,7 +119,7 @@ def _read_notes(read_part_with_body, part_paths):
111119
lambda root, body_reader: read_endnotes_xml_element(root, body_reader=body_reader),
112120
default=_empty_result,
113121
)
114-
122+
115123
return results.combine([footnotes, endnotes]).map(lists.flatten)
116124

117125

@@ -122,14 +130,42 @@ def _read_comments(read_part_with_body, part_paths):
122130
default=_empty_result,
123131
)
124132

133+
def _read_headers(read_part_with_body, part_paths):
    """Read every header part and return the results that have content.

    ``part_paths.headers`` is either a list of part paths or a single
    fallback path, so it is normalised to a list before reading. Checking
    for ``list`` rather than for the exact ``str`` type means a path of any
    string type is wrapped instead of being iterated character by
    character.

    Returns a list with one result per header part, omitting any result
    whose ``value`` is ``[]`` (presumably the default for a missing part;
    verify against ``_empty_result``).
    """
    header_paths = part_paths.headers
    if not isinstance(header_paths, list):
        header_paths = [header_paths]

    def read_header(root, body_reader):
        return read_header_xml_element(root, body_reader=body_reader)

    header_results = [
        read_part_with_body(path, read_header, default=_empty_result)
        for path in header_paths
    ]

    # NOTE(review): a plain list of results is returned, not a single
    # result; confirm the caller propagates each result's messages.
    return [result for result in header_results if result.value != []]
144+
145+
146+
def _read_footers(read_part_with_body, part_paths):
    """Read every footer part and return the results that have content.

    ``part_paths.footers`` is either a list of part paths or a single
    fallback path, so it is normalised to a list before reading. Checking
    for ``list`` rather than for the exact ``str`` type means a path of any
    string type is wrapped instead of being iterated character by
    character.

    Returns a list with one result per footer part, omitting any result
    whose ``value`` is ``[]`` (presumably the default for a missing part;
    verify against ``_empty_result``).
    """
    footer_paths = part_paths.footers
    if not isinstance(footer_paths, list):
        footer_paths = [footer_paths]

    def read_footer(root, body_reader):
        return read_footer_xml_element(root, body_reader=body_reader)

    footer_results = [
        read_part_with_body(path, read_footer, default=_empty_result)
        for path in footer_paths
    ]

    # NOTE(review): a plain list of results is returned, not a single
    # result; confirm the caller propagates each result's messages.
    return [result for result in footer_results if result.value != []]
158+
125159

126-
def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
160+
def _read_document(zip_file, read_part_with_body, notes, comments, headers, footers, part_paths):
127161
return read_part_with_body(
128162
part_paths.main_document,
129163
partial(
130164
read_document_xml_element,
131165
notes=notes,
132166
comments=comments,
167+
headers=headers,
168+
footers=footers
133169
),
134170
)
135171

mammoth/docx/document_xml.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ def read_document_xml_element(
55
element,
66
body_reader,
77
notes=None,
8-
comments=None):
8+
comments=None,
9+
headers=None,
10+
footers=None):
911

1012
if notes is None:
1113
notes = []
@@ -17,5 +19,7 @@ def read_document_xml_element(
1719
.map(lambda children: documents.document(
1820
children,
1921
notes=documents.notes(notes),
20-
comments=comments
22+
comments=comments,
23+
headers=headers,
24+
footers=footers
2125
))

mammoth/docx/header_xml.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import functools
2+
from .. import documents
3+
4+
def _read_extremity(extremity, element, body_reader):
    """Read ``element``'s children and wrap them using ``extremity``.

    ``extremity`` is the callable applied to the children that were read
    (the header or footer constructor when used through the partials
    defined in this module).
    """
    children_result = body_reader.read_all(element.children)
    return children_result.map(extremity)
7+
8+
read_header_xml_element = functools.partial(_read_extremity, documents.header)
9+
read_footer_xml_element = functools.partial(_read_extremity, documents.footer)

mammoth/html/nodes.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ class Node(object):
99
class TextNode(Node):
1010
value = cobble.field()
1111

12+
def to_text(self):
    """Return the text held by this node, i.e. its ``value``."""
    text = self.value
    return text
14+
1215

1316
@cobble.data
1417
class Tag(object):
@@ -52,6 +55,8 @@ def separator(self):
5255
def is_void(self):
5356
return not self.children and self.tag_name in self._VOID_TAG_NAMES
5457

58+
def to_text(self):
    """Return the concatenated text of this node's children.

    Each child's ``to_text()`` result is joined in child order with no
    separator. Children that do not provide ``to_text`` contribute no
    text instead of raising ``AttributeError``.
    """
    # NOTE(review): only text nodes and elements are known to define
    # to_text here; marker nodes such as ForceWrite appear not to, so
    # they are skipped. Confirm this is the desired handling.
    return "".join(
        child.to_text()
        for child in self.children
        if hasattr(child, "to_text")
    )
5560

5661
@cobble.visitable
5762
class ForceWrite(Node):

mammoth/options.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ def read_options(options):
1919
style_map += _default_style_map
2020

2121
options["ignore_empty_paragraphs"] = options.get("ignore_empty_paragraphs", True)
22+
23+
options["include_headers_and_footers"] = options.get("include_headers_and_footers", False)
24+
options["deduplicate_headers_and_footers"] = options.get("deduplicate_headers_and_footers", False)
25+
2226
options["style_map"] = style_map
2327
return read_style_map_result.map(lambda _: options)
2428

mammoth/results.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,12 @@ def combine(results):
3131
values = []
3232
messages = []
3333
for result in results:
34-
values.append(result.value)
35-
for message in result.messages:
36-
messages.append(message)
34+
if isinstance(result, list):
35+
values.append([r.value for r in result])
36+
else:
37+
values.append(result.value)
38+
for message in result.messages:
39+
messages.append(message)
3740

3841
return Result(values, messages)
3942

0 commit comments

Comments
 (0)