首页

WPS 加载项:获取文档中目录元素的 Range 下标

官方推荐使用 TablesOfContents,但它实际上只能识别自动目录元素。而通过解析 WordOpenXML,我们可以精准地区分每个段落的类型。下面是区分目录元素段落和普通段落的 JS 代码示例。这个问题卡了我很久才终于解决——金山办公自己的人也不清楚该怎么做,他们的技术支持能提供的帮助也非常有限,幸好我找到了这个办法。
js
/**
 * Classifies every paragraph of the active document as a TOC item, a heading
 * or a plain paragraph, and attaches the live Range offsets to each one.
 *
 * Two sources are combined:
 *   1. The object model (`doc.Paragraphs`) supplies index, Range.Start/End,
 *      text and the decimal `ParaID` of each paragraph.
 *   2. The Flat-OPC package in `doc.WordOpenXML` supplies the structure:
 *      which paragraphs sit inside a "Table of Contents" content control and
 *      which carry an outline level.
 * Records are joined on the paragraph id (`w14:paraId`, 8 hex digits).
 *
 * Returns `{ api, xml, merged, findById }`, or `undefined` when the package
 * cannot be parsed or `/word/document.xml` is missing (an error is logged).
 */
(function () {
  'use strict';

  const W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';
  const W14_NS = 'http://schemas.microsoft.com/office/word/2010/wordml';
  // In WordprocessingML, outline level 9 means "no level" (body text), so a
  // paragraph carrying <w:outlineLvl w:val="9"/> is not a heading.
  const NO_OUTLINE_LEVEL = '9';

  /** Canonical form of a paragraph id: upper-case hex, zero-padded to 8 digits. */
  const normalizeHexId = (hexId) => String(hexId).trim().toUpperCase().padStart(8, '0');

  /** Returns the normalized `w14:paraId` of a <w:p> element, or null if absent. */
  const readParaId = (p) => {
    const raw = p.getAttributeNS(W14_NS, 'paraId');
    return raw ? normalizeHexId(raw) : null;
  };

  /** Concatenates the content of every <w:t> run below the given element. */
  const readParagraphText = (p) =>
    Array.from(p.getElementsByTagNameNS(W_NS, 't'), (t) => t.textContent).join('');

  /** True when the <w:sdt> declares the "Table of Contents" doc-part gallery. */
  const isTocContainer = (sdt) =>
    Array.from(sdt.getElementsByTagNameNS(W_NS, 'docPartGallery')).some(
      (gallery) => gallery.getAttributeNS(W_NS, 'val') === 'Table of Contents'
    );

  const doc = Application.ActiveDocument;
  console.log('开始解析文档:', doc.Name);

  // ========== 1. Collect live paragraph data through the object model ==========
  // Read the collection and its Count once; each property access is a host call.
  const paragraphCollection = doc.Paragraphs;
  const paragraphCount = paragraphCollection.Count;
  const apiParagraphs = [];
  for (let i = 1; i <= paragraphCount; i++) {
    const para = paragraphCollection.Item(i);
    const range = para.Range;
    const paraIdDecimal = para.ParaID; // decimal id
    // Same id as the XML attribute, rendered as 8 upper-case hex digits.
    const paraIdHex = paraIdDecimal?.toString(16).toUpperCase().padStart(8, '0');
    apiParagraphs.push({
      index: i,
      paraIdDecimal,
      paraIdHex,
      text: range.Text.replace(/[\r\n]/g, ' ').trim(),
      start: range.Start,
      end: range.End,
    });
  }
  console.log(`API获取到 ${apiParagraphs.length} 个段落`);

  // ========== 2. Parse the package XML for structural information ==========
  const xmlDoc = new DOMParser().parseFromString(doc.WordOpenXML, 'application/xml');
  // DOMParser does not throw on malformed XML; it returns a <parsererror> document.
  if (xmlDoc.getElementsByTagName('parsererror').length > 0) {
    console.error('WordOpenXML 解析失败');
    return;
  }

  // Only the main document part (/word/document.xml) is of interest.
  let documentXml = null;
  for (const part of Array.from(xmlDoc.getElementsByTagName('pkg:part'))) {
    if (part.getAttribute('pkg:name') === '/word/document.xml') {
      // Guard against a part without <pkg:xmlData> instead of throwing.
      documentXml = part.getElementsByTagName('pkg:xmlData')[0]?.firstElementChild ?? null;
      break;
    }
  }
  if (!documentXml) {
    console.error('找不到 /word/document.xml');
    return;
  }

  const xmlElements = [];
  const processedParaIds = new Set();

  // TOC containers first, so their paragraphs are not reported again below.
  for (const sdt of Array.from(documentXml.getElementsByTagNameNS(W_NS, 'sdt'))) {
    if (!isTocContainer(sdt)) continue;

    xmlElements.push({ type: 'toc_container' });
    const content = sdt.getElementsByTagNameNS(W_NS, 'sdtContent')[0];
    if (!content) continue;

    for (const p of Array.from(content.getElementsByTagNameNS(W_NS, 'p'))) {
      const paraIdHex = readParaId(p);
      if (!paraIdHex) continue;
      processedParaIds.add(paraIdHex);
      xmlElements.push({
        type: 'toc_item',
        paraIdHex,
        text: readParagraphText(p).trim(),
      });
    }
  }

  // Every remaining paragraph is either a heading or a plain paragraph.
  for (const p of Array.from(documentXml.getElementsByTagNameNS(W_NS, 'p'))) {
    const paraIdHex = readParaId(p);
    if (!paraIdHex || processedParaIds.has(paraIdHex)) continue;

    // Only an outline level set on the paragraph itself is visible here; a
    // level inherited from a style lives in styles.xml and is not inspected.
    const outlineLvl = p.getElementsByTagNameNS(W_NS, 'outlineLvl')[0];
    const outlineVal = outlineLvl ? outlineLvl.getAttributeNS(W_NS, 'val') : null;
    const isHeading = Boolean(outlineLvl) && outlineVal !== NO_OUTLINE_LEVEL;

    xmlElements.push({
      type: isHeading ? 'heading' : 'paragraph',
      paraIdHex,
      text: readParagraphText(p).trim(),
      level: isHeading ? outlineVal : null,
    });
  }

  // ========== 3. Merge API data into the XML records ==========
  const apiById = new Map();
  for (const apiPara of apiParagraphs) {
    if (apiPara.paraIdHex) {
      apiById.set(apiPara.paraIdHex, apiPara);
    }
  }

  const mergedElements = xmlElements.map((el) => {
    const apiData = el.paraIdHex ? apiById.get(el.paraIdHex) : undefined;
    if (!apiData) return el;
    return {
      ...el,
      index: apiData.index,
      start: apiData.start,
      end: apiData.end,
      text: apiData.text, // prefer the API text (line breaks already normalized)
      paraIdDecimal: apiData.paraIdDecimal,
    };
  });

  // ========== 4. Print the result ==========
  console.log('\n=== 合并后的文档结构 ===');
  console.log(`总元素数: ${mergedElements.length}`);

  const stats = { paragraph: 0, heading: 0, toc_container: 0, toc_item: 0 };
  for (const { type } of mergedElements) {
    stats[type] += 1;
  }
  console.log('统计:', stats);

  console.log('\n--- 详细列表 ---');
  mergedElements.forEach((el, i) => {
    if (el.type === 'toc_container') {
      console.log(`${i + 1}. [目录容器]`);
      return;
    }
    const idInfo = el.paraIdHex ? `(hex:${el.paraIdHex} dec:${el.paraIdDecimal})` : '';
    const hasRange = el.start !== undefined && el.end !== undefined;
    const rangeInfo = hasRange ? `range:[${el.start},${el.end}]` : '';
    const indexInfo = el.index ? `index:${el.index}` : '';
    console.log(`${i + 1}. [${el.type}] ${el.text || '(空)'} ${indexInfo} ${idInfo} ${rangeInfo}`);
  });

  // ========== 5. Return the complete data set ==========
  return {
    api: apiParagraphs,
    xml: xmlElements,
    merged: mergedElements,
    /**
     * Looks a paragraph up by its hexadecimal id (case-insensitive).
     * Each field of the result is `undefined` when that source has no match.
     */
    findById: (hexId) => {
      const wanted = normalizeHexId(hexId);
      const hasWantedId = (entry) => entry.paraIdHex === wanted;
      return {
        api: apiParagraphs.find(hasWantedId),
        xml: xmlElements.find(hasWantedId),
        merged: mergedElements.find(hasWantedId),
      };
    },
  };
})();