PHP读取docx文档内容

引言

客户需求: 需要从docx文档中读取内容并做简单的格式化. 难点在于如何读取docx格式, 并将其转换为PHP可以处理的字符串. 按惯例先贴代码.

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
/**
 * Class Docx2Text
 *
 * Docx => String
 *
 * Reads word/document.xml from a .docx (ZIP) archive and flattens the children
 * of its <w:body> element into plain text. Footnote and endnote texts are
 * appended in square brackets after the block that references them.
 *
 * Usage: construct, call setDocx() with the archive path, then call extract().
 * Failures are reported with RuntimeException.
 */
class Docx2Text
{
    const SEPARATOR_TAB = "\t";

    /**
     * Namespace URI of the WordprocessingML main vocabulary (the "w:" prefix).
     */
    const NS_WORDML = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';

    /**
     * Archive opened by setDocx(), or null before the first call.
     *
     * @var ZipArchive|null
     * @access private
     */
    private $docx;

    /**
     * DOM built from document.xml during extract().
     *
     * @var DOMDocument|null
     * @access private
     */
    private $domDocument;

    /**
     * XPath evaluator bound to $domDocument, with the "w" prefix registered.
     *
     * @var DOMXPath|null
     * @access private
     */
    private $xpath;

    /**
     * Raw xml from document.xml (false when the archive has no such entry).
     *
     * @var string|false
     * @access private
     */
    private $_document;

    /**
     * Raw xml from numbering.xml (false when the archive has no such entry).
     *
     * @var string|false
     * @access private
     */
    private $_numbering;

    /**
     * Raw xml from footnotes.xml (false when the archive has no such entry).
     *
     * @var string|false
     * @access private
     */
    private $_footnote;

    /**
     * Raw xml from endnotes.xml (false when the archive has no such entry).
     *
     * @var string|false
     * @access private
     */
    private $_endnote;

    /**
     * Endnote texts of the document, keyed by their w:id attribute.
     *
     * @var array
     * @access private
     */
    private $endnotes;

    /**
     * Footnote texts of the document, keyed by their w:id attribute.
     *
     * @var array
     * @access private
     */
    private $footnotes;

    /**
     * Relations of the document. Initialised to an empty array and not
     * populated by any code in this class.
     *
     * @var array
     * @access private
     */
    private $relations;

    /**
     * Bullet/number characters per list: [numId][level 0..3] => list of markers.
     * Filled by listNumbering(); the code that would print these markers is
     * not active, so the table currently has no effect on the output.
     *
     * @var array
     * @access private
     */
    private $numberingList;

    /**
     * The text content produced by the last extract() call.
     *
     * @var string
     * @access private
     */
    private $textOutput;

    /**
     * Value of the 'chart' option. It is stored only: this class contains no
     * active chart extraction code.
     *
     * @var mixed
     * @access private
     */
    private $chart2text;

    /**
     * Whether <w:tbl> children of the body are rendered cell by cell
     * (see table()) instead of being flattened like any other block.
     *
     * @var bool
     * @access private
     */
    private $table2text;

    /**
     * Whether numbering.xml is loaded into $numberingList.
     *
     * @var bool
     * @access private
     */
    private $list2text;

    /**
     * Whether extract() appends an extra line separator after each
     * non-table block.
     *
     * @var bool
     * @access private
     */
    private $paragraph2text;

    /**
     * Whether referenced footnotes are appended to the text.
     *
     * @var bool
     * @access private
     */
    private $footnote2text;

    /**
     * Whether referenced endnotes are appended to the text.
     *
     * @var bool
     * @access private
     */
    private $endnote2text;

    /**
     * True once numbering.xml has been looked up for the current archive, so
     * the zip is not re-read for every paragraph when the part is absent.
     *
     * @var bool
     * @access private
     */
    private $numberingLoaded;

    /**
     * True once endnotes.xml has been looked up for the current archive.
     *
     * @var bool
     * @access private
     */
    private $endnotesLoaded;

    /**
     * True once footnotes.xml has been looked up for the current archive.
     *
     * @var bool
     * @access private
     */
    private $footnotesLoaded;

    /**
     * Construct
     *
     * @param array $boolTransforms optional switches keyed by 'table', 'list',
     *                              'paragraph', 'footnote', 'endnote' and
     *                              'chart'; every key that is not set
     *                              defaults to true
     * @access public
     */
    public function __construct($boolTransforms = array())
    {
        //table, list, paragraph, footnote, endnote, chart
        $this->table2text     = isset($boolTransforms['table']) ? $boolTransforms['table'] : true;
        $this->list2text      = isset($boolTransforms['list']) ? $boolTransforms['list'] : true;
        $this->paragraph2text = isset($boolTransforms['paragraph']) ? $boolTransforms['paragraph'] : true;
        $this->footnote2text  = isset($boolTransforms['footnote']) ? $boolTransforms['footnote'] : true;
        $this->endnote2text   = isset($boolTransforms['endnote']) ? $boolTransforms['endnote'] : true;
        $this->chart2text     = isset($boolTransforms['chart']) ? $boolTransforms['chart'] : true;

        $this->docx = null;
        $this->_document = '';
        $this->relations = array();
        $this->resetDocumentState();
    }

    /**
     * Extract the content of the loaded word document and, when a file name
     * is given, also write the text to that file.
     *
     * @access public
     * @param string $filename optional path of a text file to write the result to
     *
     * @return string the extracted text
     * @throws RuntimeException when no document.xml content is loaded, when it
     *                          cannot be parsed, or when the output file
     *                          cannot be written
     */
    public function extract($filename = '')
    {
        if (empty($this->_document)) {
            //xml content from document.xml is not got
            throw new RuntimeException('There is no content');
        }

        $dom = new DOMDocument();
        if (!$dom->loadXML($this->_document)) {
            throw new RuntimeException('word/document.xml could not be parsed as XML');
        }
        $this->domDocument = $dom;
        $this->xpath = new DOMXPath($dom);
        $this->xpath->registerNamespace('w', self::NS_WORDML);

        //start from scratch so that calling extract() twice does not duplicate the text
        $this->textOutput = '';

        //it is known that there is only one body tag
        $bodyNode = $dom->getElementsByTagNameNS(self::NS_WORDML, 'body')->item(0);
        if ($bodyNode !== null) {
            foreach ($bodyNode->childNodes as $child) {
                if (!($child instanceof DOMElement)) {
                    //whitespace, comments, ... between the block elements carry no content
                    continue;
                }
                if ($this->table2text && $this->isWordElement($child, 'tbl')) {
                    //this node is a table and its cells are split with tabs
                    $this->textOutput .= $this->table($child) . $this->separator();
                } else {
                    //every other element is flattened like a paragraph
                    $this->textOutput .= $this->printWP($child) . ($this->paragraph2text ? $this->separator() : '');
                }
            }
        }

        if (!empty($filename)) {
            $this->writeFile($filename, $this->textOutput);
        }

        return $this->textOutput;
    }

    /**
     * Setter: open the .docx archive and read word/document.xml from it.
     * Data cached from a previously loaded archive is discarded.
     *
     * @access public
     * @param string $filename path of the .docx file
     * @throws RuntimeException when the file cannot be opened as a zip archive
     */
    public function setDocx($filename)
    {
        $archive = new ZipArchive();
        if ($archive->open($filename) !== true) {
            throw new RuntimeException('failed');
        }

        if ($this->docx instanceof ZipArchive) {
            //release the handle of the archive loaded before
            $this->docx->close();
        }

        $this->docx = $archive;
        $this->resetDocumentState();
        $this->_document = $archive->getFromName('word/document.xml');
    }

    /**
     * Forget everything derived from the current archive (parsed DOM, notes,
     * numbering, produced text) so the next document starts clean.
     *
     * @access private
     */
    private function resetDocumentState()
    {
        $this->domDocument = null;
        $this->xpath = null;
        $this->textOutput = '';
        $this->_numbering = '';
        $this->_footnote = '';
        $this->_endnote = '';
        $this->numberingList = array();
        $this->endnotes = array();
        $this->footnotes = array();
        $this->numberingLoaded = false;
        $this->endnotesLoaded = false;
        $this->footnotesLoaded = false;
    }

    /**
     * Read one entry of the opened archive.
     *
     * @access private
     * @param string $name entry name inside the zip, e.g. 'word/endnotes.xml'
     * @return string|false the entry content, or false when there is no
     *                      archive or no such entry
     */
    private function readPart($name)
    {
        if (!($this->docx instanceof ZipArchive)) {
            return false;
        }

        return $this->docx->getFromName($name);
    }

    /**
     * Extract the content to an array from endnotes.xml (done once per archive)
     *
     * @access private
     */
    private function loadEndNote()
    {
        if ($this->endnotesLoaded) {
            return;
        }
        $this->endnotesLoaded = true;

        if (empty($this->_endnote)) {
            $this->_endnote = $this->readPart('word/endnotes.xml');
        }
        $this->endnotes = $this->parseNotes($this->_endnote, 'endnote');
    }

    /**
     * Extract the content to an array from footnotes.xml (done once per archive)
     *
     * @access private
     */
    private function loadFootNote()
    {
        if ($this->footnotesLoaded) {
            return;
        }
        $this->footnotesLoaded = true;

        if (empty($this->_footnote)) {
            $this->_footnote = $this->readPart('word/footnotes.xml');
        }
        $this->footnotes = $this->parseNotes($this->_footnote, 'footnote');
    }

    /**
     * Turn the xml of a notes part into a map of note id => note text.
     *
     * @access private
     * @param string|false $xml       raw xml of footnotes.xml / endnotes.xml
     * @param string       $localName 'footnote' or 'endnote'
     * @return array note texts keyed by their w:id attribute; empty when the
     *               xml is missing or cannot be parsed
     */
    private function parseNotes($xml, $localName)
    {
        $notes = array();
        if (empty($xml)) {
            return $notes;
        }

        $dom = new DOMDocument();
        if (!$dom->loadXML($xml)) {
            return $notes;
        }

        foreach ($dom->getElementsByTagNameNS(self::NS_WORDML, $localName) as $note) {
            $notes[$note->getAttributeNS(self::NS_WORDML, 'id')] = $this->toText($note);
        }

        return $notes;
    }

    /**
     * Extract the styles of the lists from numbering.xml into $numberingList
     * (done once per archive).
     *
     * @access private
     */
    private function listNumbering()
    {
        if ($this->numberingLoaded) {
            return;
        }
        $this->numberingLoaded = true;

        //get the xml code from the zip archive
        $this->_numbering = $this->readPart('word/numbering.xml');
        if (empty($this->_numbering)) {
            return;
        }

        $dom = new DOMDocument();
        if (!$dom->loadXML($this->_numbering)) {
            return;
        }

        //there is only one numbering tag in the numbering.xml
        $root = $dom->getElementsByTagNameNS(self::NS_WORDML, 'numbering')->item(0);
        if ($root === null) {
            return;
        }

        $formats = array();       //abstractNumId => w:numFmt value of its first level
        $numToAbstract = array(); //numId => abstractNumId
        foreach ($root->childNodes as $child) {
            if (!($child instanceof DOMElement)) {
                continue;
            }
            if ($this->isWordElement($child, 'abstractNum')) {
                $format = $this->firstNumberFormat($child);
                if ($format !== null) {
                    $formats[$child->getAttributeNS(self::NS_WORDML, 'abstractNumId')] = $format;
                }
            } elseif ($this->isWordElement($child, 'num')) {
                foreach ($child->childNodes as $son) {
                    if ($son instanceof DOMElement && $this->isWordElement($son, 'abstractNumId')) {
                        //keyed by numId so several lists may share one abstract definition
                        $numToAbstract[$child->getAttributeNS(self::NS_WORDML, 'numId')]
                            = $son->getAttributeNS(self::NS_WORDML, 'val');
                    }
                }
            }
        }

        //once we know what kind of list there is in the document, prepare the markers for four levels
        foreach ($numToAbstract as $numId => $abstractId) {
            $format = isset($formats[$abstractId]) ? $formats[$abstractId] : '';
            if ($format == 'decimal') {
                //if the type is decimal it means that the bullet will be numbers
                $this->numberingList[$numId] = array(
                    range(1, 10),
                    range(1, 10),
                    range(1, 10),
                    range(1, 10),
                );
            } else {
                //otherwise is *, and other characters
                $this->numberingList[$numId] = array(
                    array_fill(0, 17, '*'),
                    array_fill(0, 12, chr(175)),
                    array_fill(0, 12, chr(237)),
                    array_fill(0, 11, chr(248)),
                );
            }
        }
    }

    /**
     * Find the w:val of the first <w:numFmt> inside the <w:lvl> children of
     * an <w:abstractNum> element.
     *
     * @access private
     * @param DOMElement $abstractNum
     * @return string|null the format name, or null when no level declares one
     */
    private function firstNumberFormat($abstractNum)
    {
        foreach ($abstractNum->childNodes as $level) {
            if (!($level instanceof DOMElement) || !$this->isWordElement($level, 'lvl')) {
                continue;
            }
            foreach ($level->childNodes as $property) {
                if ($property instanceof DOMElement && $this->isWordElement($property, 'numFmt')) {
                    return $property->getAttributeNS(self::NS_WORDML, 'val');
                }
            }
        }

        return null;
    }

    /**
     * Extract the content of a w:p tag (or of any other block element handed
     * in): its text, followed by the texts of the endnotes and footnotes
     * referenced from its direct <w:r> children, followed by a line separator.
     *
     * @access private
     * @param DOMElement $node
     * @return string
     */
    private function printWP($node)
    {
        if ($this->list2text) {
            //list markers are not printed; the numbering definitions are only loaded
            $this->listNumbering();
        }

        $ret = $this->toText($node);

        //extract the specific endnotes to insert with the text content
        if ($this->endnote2text) {
            $this->loadEndNote();
            $ret .= $this->noteMarkers($node, 'w:r/w:endnoteReference', $this->endnotes);
        }

        //extract the specific footnotes to insert with the text content
        if ($this->footnote2text) {
            $this->loadFootNote();
            $ret .= $this->noteMarkers($node, 'w:r/w:footnoteReference', $this->footnotes);
        }

        return $ret . $this->separator();
    }

    /**
     * Build the "[note text] " fragments for every note reference matched by
     * an XPath expression relative to the given node.
     *
     * @access private
     * @param DOMElement $node       context node of the query
     * @param string     $expression XPath selecting the reference elements
     * @param array      $notes      note texts keyed by id
     * @return string one "[...] " fragment per reference; a reference to an
     *                unknown id yields "[] "
     */
    private function noteMarkers($node, $expression, $notes)
    {
        $markers = '';
        $references = $this->xpath->query($expression, $node);
        if ($references === false) {
            return $markers;
        }

        foreach ($references as $reference) {
            $id = $reference->getAttributeNS(self::NS_WORDML, 'id');
            $markers .= '[' . (isset($notes[$id]) ? $notes[$id] : '') . '] ';
        }

        return $markers;
    }

    /**
     * return a text end of line
     *
     * @access private
     * @return string
     */
    private function separator()
    {
        return "\r\n";
    }

    /**
     * Extract the content of a table node from the document.xml and return a
     * text content.
     *
     * Every element child of the table contributes one line separator. For a
     * <w:tr> row, each non-empty <w:tc> cell first contributes the printWP()
     * output of its element children followed by a tab.
     *
     * @access private
     * @param DOMElement $node the <w:tbl> element
     *
     * @return string
     */
    private function table($node)
    {
        $output = '';
        foreach ($node->childNodes as $child) {
            if (!($child instanceof DOMElement)) {
                continue;
            }
            //start a new line of the table
            if ($this->isWordElement($child, 'tr')) {
                foreach ($child->childNodes as $cell) {
                    //start a new cell
                    if (!($cell instanceof DOMElement) || !$this->isWordElement($cell, 'tc')) {
                        continue;
                    }
                    if ($cell->hasChildNodes()) {
                        foreach ($cell->childNodes as $block) {
                            if ($block instanceof DOMElement) {
                                $output .= $this->printWP($block);
                            }
                        }
                        $output .= self::SEPARATOR_TAB;
                    }
                }
            }
            $output .= $this->separator();
        }

        return $output;
    }

    /**
     * Return only the text content of a node, without markup and with
     * surrounding whitespace trimmed. XML escapes are decoded, so "&amp;" in
     * the source xml comes out as "&".
     *
     * @access private
     * @param DOMNode $node
     *
     * @return string
     */
    private function toText($node)
    {
        return trim($node->textContent);
    }

    /**
     * Tell whether a node is the WordprocessingML element with the given
     * local name, whatever prefix the document binds to that namespace.
     *
     * @access private
     * @param DOMNode $node
     * @param string  $localName element name without prefix, e.g. 'tbl'
     * @return bool
     */
    private function isWordElement($node, $localName)
    {
        return $node->namespaceURI === self::NS_WORDML && $node->localName === $localName;
    }

    /**
     * Write the extracted text to a file.
     *
     * @access private
     * @param string $filename target path
     * @param string $content  text to store
     * @throws RuntimeException when the file cannot be written
     */
    private function writeFile($filename, $content)
    {
        if (file_put_contents($filename, $content) === false) {
            throw new RuntimeException(sprintf('Unable to write the extracted text to "%s"', $filename));
        }
    }
}

// Create the extractor with its default options
$extractor = new Docx2Text();
// Point it at the .docx file to read
$extractor->setDocx('./1.docx');
// No output file name is passed, so the text comes back as the return value
$content = $extractor->extract();
// Dump the result for debugging
var_dump($content);

小结

代码中处理docx的类来自这里
其实docx本质上是一个ZIP压缩包, 里面包含若干XML文件(正文位于word/document.xml), 所以用ZipArchive解压后再用DOM解析即可.