// Language: JavaScript
/**
 * Builds a structural outline of the active document by combining two views:
 *
 *   1. the host object model (`Application.ActiveDocument.Paragraphs`), which
 *      yields each paragraph's index, ParaID, text and character range;
 *   2. the `/word/document.xml` part inside `doc.WordOpenXML`, which yields
 *      table-of-contents containers/items and paragraphs carrying an
 *      `outlineLvl` element (reported as headings).
 *
 * Entries from both views are joined on the paragraph ID, rendered as an
 * 8-digit upper-case hexadecimal string. A summary and a detailed listing are
 * written to the console.
 *
 * Requires the host to provide the globals `Application` and `DOMParser`.
 *
 * The IIFE evaluates to `{ api, xml, merged, findById }`, or to `undefined`
 * (after logging an error) when the `/word/document.xml` part cannot be found.
 */
(function () {
  'use strict';

  // Namespace of the WordprocessingML elements read below (p, t, sdt, ...).
  const W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';
  // Namespace of the `paraId` attribute read from each paragraph element.
  const W14_NS = 'http://schemas.microsoft.com/office/word/2010/wordml';
  const HEX_ID_WIDTH = 8;

  /** Normalizes a hexadecimal ID so both data sources use the same key form. */
  const toHexId = (raw) => String(raw).toUpperCase().padStart(HEX_ID_WIDTH, '0');

  /** Returns a static array of descendants of `node` named `localName` in W_NS. */
  const wordDescendants = (node, localName) =>
    Array.from(node.getElementsByTagNameNS(W_NS, localName));

  /** Returns the normalized paragraph ID of a `p` element, or null if it has none. */
  const readParaId = (p) => {
    const raw = p.getAttributeNS(W14_NS, 'paraId');
    return raw ? toHexId(raw) : null;
  };

  /** Concatenates the content of every `t` descendant of a `p` element, trimmed. */
  const readParagraphText = (p) =>
    wordDescendants(p, 't').map((t) => t.textContent).join('').trim();

  /** True when the `sdt` element declares the "Table of Contents" gallery. */
  const isTocContainer = (sdt) =>
    wordDescendants(sdt, 'docPartGallery')
      .some((gallery) => gallery.getAttributeNS(W_NS, 'val') === 'Table of Contents');

  const doc = Application.ActiveDocument;
  console.log('开始解析文档:', doc.Name);

  // ========== 1. Collect live paragraph information through the API ==========
  const apiParagraphs = [];
  // Read the count once; the collection is not modified by this script.
  const paragraphCount = doc.Paragraphs.Count;
  for (let i = 1; i <= paragraphCount; i++) {
    const para = doc.Paragraphs.Item(i);
    const paraIdDecimal = para.ParaID; // decimal ID
    const range = para.Range;
    apiParagraphs.push({
      index: i,
      paraIdDecimal,
      // Stays undefined when the paragraph exposes no ParaID.
      paraIdHex: paraIdDecimal == null ? undefined : toHexId(paraIdDecimal.toString(16)),
      text: range.Text.replace(/[\r\n]/g, ' ').trim(),
      start: range.Start,
      end: range.End,
    });
  }
  console.log(`API获取到 ${apiParagraphs.length} 个段落`);

  // ========== 2. Parse the XML to obtain structural information ==========
  const xmlDoc = new DOMParser().parseFromString(doc.WordOpenXML, 'application/xml');

  // Only the /word/document.xml part is of interest.
  const documentPart = Array.from(xmlDoc.getElementsByTagName('pkg:part'))
    .find((part) => part.getAttribute('pkg:name') === '/word/document.xml');
  // A part without pkg:xmlData is treated the same as a missing part instead
  // of throwing while dereferencing it.
  const documentXml =
    documentPart?.getElementsByTagName('pkg:xmlData')[0]?.firstElementChild ?? null;
  if (!documentXml) {
    console.error('找不到 /word/document.xml');
    return;
  }

  const xmlElements = [];
  const processedParaIds = new Set();

  // Table-of-contents containers first: their paragraphs become `toc_item`
  // entries and are remembered so the general pass below skips them.
  for (const sdt of wordDescendants(documentXml, 'sdt')) {
    if (!isTocContainer(sdt)) continue;

    xmlElements.push({
      type: 'toc_container'
    });

    const content = wordDescendants(sdt, 'sdtContent')[0];
    if (!content) continue;

    for (const p of wordDescendants(content, 'p')) {
      const paraIdHex = readParaId(p);
      if (!paraIdHex) continue;

      processedParaIds.add(paraIdHex);
      xmlElements.push({
        type: 'toc_item',
        paraIdHex,
        text: readParagraphText(p)
      });
    }
  }

  // Every remaining paragraph that has an ID.
  for (const p of wordDescendants(documentXml, 'p')) {
    const paraIdHex = readParaId(p);
    if (!paraIdHex || processedParaIds.has(paraIdHex)) continue;

    // A paragraph counts as a heading when it contains an outlineLvl element.
    const outlineLvl = wordDescendants(p, 'outlineLvl')[0];
    const isHeading = outlineLvl !== undefined;
    xmlElements.push({
      type: isHeading ? 'heading' : 'paragraph',
      paraIdHex,
      text: readParagraphText(p),
      level: isHeading ? outlineLvl.getAttributeNS(W_NS, 'val') : null
    });
  }

  // ========== 3. Merge API and XML data ==========
  // Hex ID -> API record; a later paragraph with the same ID replaces an earlier one.
  const apiByHexId = new Map();
  for (const p of apiParagraphs) {
    if (p.paraIdHex) {
      apiByHexId.set(p.paraIdHex, p);
    }
  }

  const mergedElements = xmlElements.map((el) => {
    const apiData = el.paraIdHex ? apiByHexId.get(el.paraIdHex) : undefined;
    if (!apiData) return el;
    return {
      ...el,
      index: apiData.index,
      start: apiData.start,
      end: apiData.end,
      text: apiData.text, // prefer the API text (line breaks already replaced)
      paraIdDecimal: apiData.paraIdDecimal
    };
  });

  // ========== 4. Print the result ==========
  console.log('\n=== 合并后的文档结构 ===');
  console.log(`总元素数: ${mergedElements.length}`);

  // Single pass; the four keys are exactly the types produced above.
  const stats = { paragraph: 0, heading: 0, toc_container: 0, toc_item: 0 };
  for (const el of mergedElements) {
    stats[el.type] += 1;
  }
  console.log('统计:', stats);

  console.log('\n--- 详细列表 ---');
  mergedElements.forEach((el, i) => {
    if (el.type === 'toc_container') {
      console.log(`${i + 1}. [目录容器]`);
      return;
    }
    const idInfo = el.paraIdHex ?
      `(hex:${el.paraIdHex} dec:${el.paraIdDecimal})` : '';
    const rangeInfo = (el.start !== undefined && el.end !== undefined) ?
      `range:[${el.start},${el.end}]` : '';
    // index is 1-based, so it is only absent for entries with no API match.
    const indexInfo = el.index !== undefined ? `index:${el.index}` : '';
    console.log(`${i + 1}. [${el.type}] ${el.text || '(空)'} ${indexInfo} ${idInfo} ${rangeInfo}`);
  });

  // ========== 5. Return the complete data ==========
  return {
    api: apiParagraphs,
    xml: xmlElements,
    merged: mergedElements,
    /**
     * Looks up one paragraph in all three collections by hexadecimal ID.
     * The ID is matched case-insensitively and left-padded to 8 digits.
     *
     * @param {string} hexId - hexadecimal paragraph ID
     * @returns {{api: object|undefined, xml: object|undefined, merged: object|undefined}}
     *   the first matching entry of each collection, or undefined where none matches
     */
    findById: (hexId) => {
      if (hexId == null) {
        // A missing ID must not match entries that simply have no paraIdHex.
        return { api: undefined, xml: undefined, merged: undefined };
      }
      const wanted = toHexId(hexId);
      const hasWantedId = (entry) => entry.paraIdHex === wanted;
      return {
        api: apiParagraphs.find(hasWantedId),
        xml: xmlElements.find(hasWantedId),
        merged: mergedElements.find(hasWantedId)
      };
    }
  };
})();