头条采集单篇文案工具
发表于 ・ 视频
// ==UserScript==
// @name 头条单篇文章采集Word-纯文案版
// @namespace local.toutiao.word.text.only
// @version 7.0
// @description 单篇文章单Word,只采集标题和正文文案,不采集图片
// @match https://www.toutiao.com/*
// @grant GM_xmlhttpRequest
// @connect *
// ==/UserScript==
(function () {
'use strict';
// Runtime settings. run() overwrites scrollTimes, maxArticles and
// requestTimeout with the values typed into the panel before each run.
const config = {
  // How many scroll-to-bottom passes autoScroll() makes.
  scrollTimes: 2,
  // Pause after each scroll pass, in milliseconds.
  scrollDelay: 1500,
  // Upper bound applied to the list of collected article links.
  maxArticles: 30,
  // Timeout passed to GM_xmlhttpRequest, in milliseconds.
  requestTimeout: 12000
};

// Panel elements; they stay undefined until createUI() assigns them.
let logBox;
let startBtn;
let statusText;

/** Returns a promise that resolves once `ms` milliseconds have elapsed. */
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
/**
 * Turns an article title into a string usable as a download file name.
 *
 * Characters that are illegal in Windows file names are removed, runs of
 * whitespace collapse to a single space, and the result is limited to 70
 * UTF-16 code units.
 *
 * @param {*} name - Title to sanitise; falsy values use the fallback.
 * @returns {string} Sanitised name, or '文章' when nothing usable remains.
 */
function safeFileName(name) {
  const fallback = '文章';
  const maxLength = 70;
  const truncated = String(name || fallback)
    .replace(/[\\/:*?"<>|]/g, '')
    .replace(/\s+/g, ' ')
    .trim()
    .slice(0, maxLength)
    // The cut may land between the two halves of a surrogate pair; drop the
    // orphaned high surrogate so the name stays valid Unicode.
    .replace(/[\uD800-\uDBFF]$/, '')
    // The cut may also expose a space that used to be in the middle.
    .trimEnd();
  return truncated || fallback;
}
/**
 * Escapes the four characters that are significant in HTML text and
 * double-quoted attribute values so scraped text can be embedded in the
 * generated document without being parsed as markup.
 *
 * @param {*} str - Value to escape; falsy values become an empty string.
 * @returns {string} The escaped text.
 */
function escapeHtml(str) {
  // The replacement entities are spelled with a \x26 escape (the ampersand)
  // instead of a literal ampersand. That way the entity text cannot be
  // collapsed back into the raw character if this script is ever passed
  // through an HTML entity decoder, e.g. when it is published on a web page.
  // The ampersand itself must be handled first so the entities produced by
  // the later steps are not escaped a second time.
  return String(str || '')
    .replace(/&/g, '\x26amp;')
    .replace(/</g, '\x26lt;')
    .replace(/>/g, '\x26gt;')
    .replace(/"/g, '\x26quot;');
}
/**
 * Appends a timestamped line to the panel's log area and scrolls the log
 * to its bottom. Relies on `logBox` having been assigned by createUI().
 *
 * @param {string} msg - Text to display.
 * @param {string} [type='info'] - Extra CSS class for the line; the panel
 *   stylesheet defines colours for 'ok', 'warn' and 'err'.
 */
function log(msg, type = 'info') {
  const timestamp = new Date().toLocaleTimeString();
  const entry = Object.assign(document.createElement('div'), {
    className: `tt-log-item ${type}`,
    textContent: `[${timestamp}] ${msg}`
  });
  logBox.appendChild(entry);
  // Keep the newest entry in view.
  logBox.scrollTop = logBox.scrollHeight;
}
/**
 * Replaces the text shown in the panel's status line. Relies on
 * `statusText` having been assigned by createUI().
 *
 * @param {string} msg - New status text.
 */
function setStatus(msg) {
  const statusNode = statusText;
  statusNode.textContent = msg;
}
/**
 * Promise wrapper around GM_xmlhttpRequest for a single GET request.
 *
 * The timeout comes from `config.requestTimeout` and the current page URL
 * is sent as the referer header.
 *
 * @param {string} url - Address to fetch.
 * @param {string} [responseType='document'] - Passed through to
 *   GM_xmlhttpRequest.
 * @returns {Promise<object>} Resolves with the response object handed to
 *   `onload`. Rejects with an Error on network failure, timeout, abort, or
 *   an HTTP status of 400 and above.
 */
function request(url, responseType = 'document') {
  return new Promise((resolve, reject) => {
    GM_xmlhttpRequest({
      method: 'GET',
      url,
      responseType,
      timeout: config.requestTimeout,
      headers: { referer: location.href },
      onload: (res) => {
        // onload also fires for HTTP error responses; without this check an
        // error page would be handed to the caller as if it were an article.
        const status = res?.status;
        if (typeof status === 'number' && status >= 400) {
          reject(new Error(`请求失败:HTTP ${status}`));
          return;
        }
        resolve(res);
      },
      onerror: () => reject(new Error('请求失败')),
      ontimeout: () => reject(new Error('请求超时')),
      // Without an abort handler the promise would never settle.
      onabort: () => reject(new Error('请求已中止'))
    });
  });
}
/**
 * Builds the floating control panel and wires it up.
 *
 * Adds a <style> element to document.head, appends the panel to
 * document.body, stores the log area, start button and status span in the
 * shared `logBox`, `startBtn` and `statusText` variables, makes the start
 * button call run(), and writes a first line to the log.
 */
function createUI() {
// Stylesheet for the panel; most rules are scoped under #tt-word-panel,
// the .tt-log-item colour rules are not.
const style = document.createElement('style');
style.textContent = `
#tt-word-panel {
position: fixed; right: 22px; top: 90px; width: 390px; z-index: 999999;
background: linear-gradient(180deg,#fff5eb,#fff);
border: 1px solid #ff6a00; border-radius: 18px;
box-shadow: 0 12px 34px rgba(255,80,0,.25);
font-family: Arial,"Microsoft YaHei",sans-serif; overflow: hidden;
}
#tt-word-panel .tt-header {
background: linear-gradient(90deg,#ff5000,#ff8a00); color: #fff;
padding: 14px 16px; font-size: 17px; font-weight: 700;
display: flex; justify-content: space-between;
}
#tt-word-panel .tt-badge {
font-size: 12px; border: 1px solid rgba(255,255,255,.6);
border-radius: 999px; padding: 2px 8px;
}
#tt-word-panel .tt-body { padding: 14px; }
#tt-word-panel .tt-row {
display: grid; grid-template-columns: 1fr 1fr; gap: 8px; margin-bottom: 10px;
}
#tt-word-panel label { font-size: 12px; color: #666; display: block; margin-bottom: 4px; }
#tt-word-panel input {
width: 100%; box-sizing: border-box; border: 1px solid #ffd0ad;
border-radius: 10px; padding: 8px; outline: none;
}
#tt-word-panel .tt-status {
background: #fff0e3; color: #a33a00; border-radius: 10px;
padding: 9px; font-size: 13px; margin-bottom: 10px;
}
#tt-word-panel .tt-btn {
width: 100%; border: 0; border-radius: 999px; padding: 11px;
background: linear-gradient(90deg,#ff5000,#ff7a00);
color: #fff; font-size: 15px; font-weight: 700; cursor: pointer;
}
#tt-word-panel .tt-btn:disabled { opacity: .55; cursor: not-allowed; }
#tt-word-panel .tt-log {
margin-top: 12px; height: 260px; overflow-y: auto; background: #1f1f1f;
color: #eee; border-radius: 12px; padding: 10px; font-size: 12px; line-height: 1.6;
}
.tt-log-item.ok { color:#73ff8f; }
.tt-log-item.warn { color:#ffd166; }
.tt-log-item.err { color:#ff8a8a; }
`;
document.head.appendChild(style);
// Panel markup. The three numeric inputs are pre-filled from `config`;
// run() reads them back through their ids when the button is pressed.
const panel = document.createElement('div');
panel.id = 'tt-word-panel';
panel.innerHTML = `
<div class="tt-header">
<span>淘宝风采集器</span>
<span class="tt-badge">纯文案</span>
</div>
<div class="tt-body">
<div class="tt-row">
<div>
<label>滚动次数</label>
<input id="tt-scroll-times" type="number" value="${config.scrollTimes}">
</div>
<div>
<label>最多文章</label>
<input id="tt-max-articles" type="number" value="${config.maxArticles}">
</div>
</div>
<div class="tt-row">
<div>
<label>请求超时ms</label>
<input id="tt-timeout" type="number" value="${config.requestTimeout}">
</div>
<div>
<label>模式</label>
<input value="只采集文案" disabled>
</div>
</div>
<div class="tt-status">状态:<span id="tt-status-text">等待开始</span></div>
<button id="tt-start-btn" class="tt-btn">开始逐篇导出Word</button>
<div id="tt-log" class="tt-log"></div>
</div>
`;
document.body.appendChild(panel);
// Cache the elements that log(), setStatus() and run() operate on.
logBox = panel.querySelector('#tt-log');
startBtn = panel.querySelector('#tt-start-btn');
statusText = panel.querySelector('#tt-status-text');
startBtn.onclick = run;
log('插件已加载:当前为纯文案模式。', 'ok');
}
/**
 * Scrolls the window to the bottom of the page `config.scrollTimes` times,
 * logging each pass and waiting `config.scrollDelay` milliseconds after it.
 *
 * @returns {Promise<void>}
 */
async function autoScroll() {
  let pass = 0;
  while (pass < config.scrollTimes) {
    pass += 1;
    window.scrollTo(0, document.body.scrollHeight);
    log(`滚动加载 ${pass}/${config.scrollTimes}`);
    await sleep(config.scrollDelay);
  }
}
/**
 * Gathers article URLs from the `a.title` anchors on the current page.
 *
 * Each href is resolved against `location.origin`, kept only if it contains
 * `/article/<digits>`, de-duplicated in first-seen order, and the list is
 * cut to `config.maxArticles` entries.
 *
 * @returns {string[]} Absolute article URLs.
 */
function collectLinks() {
  const seen = new Set();
  for (const anchor of document.querySelectorAll('a.title[href*="/article/"]')) {
    const absolute = new URL(anchor.href, location.origin).href;
    if (/\/article\/\d+/.test(absolute)) {
      seen.add(absolute);
    }
  }
  return Array.from(seen).slice(0, config.maxArticles);
}
/**
 * Normalises whitespace in scraped text.
 *
 * Non-breaking spaces become ordinary spaces, whitespace at the end of a
 * line is removed, runs of three or more newlines shrink to one blank line,
 * and the result is trimmed.
 *
 * @param {*} t - Text to clean; falsy values yield an empty string.
 * @returns {string} The cleaned text.
 */
function cleanText(t) {
  return String(t || '')
    .replace(/\u00a0/g, ' ')
    // Match only non-newline whitespace before a line break. Using \s here
    // would also consume the preceding newlines, collapsing every blank line
    // and leaving the rule below with nothing to match.
    .replace(/[^\S\n]+\n/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
/**
 * Extracts the title and body paragraphs from an article document.
 *
 * The title is the first non-empty cleaned text among `h1`,
 * `.article-title`, the first element whose class contains "title", and
 * `doc.title`; '未命名文章' is used when all are empty. Paragraphs are the
 * `<p>` elements of the first matching content container (falling back to
 * `doc.body`), cleaned, with empty ones and ones matching the noise
 * pattern removed.
 *
 * @param {Document} doc - Parsed article page.
 * @param {string} url - Address the page was fetched from; returned as-is.
 * @returns {{url: string, title: string, paragraphs: string[]}}
 */
function parseArticle(doc, url) {
  // Title: try element selectors in priority order, then the document title.
  let title = '';
  for (const selector of ['h1', '.article-title', '[class*="title"]']) {
    title = cleanText(doc.querySelector(selector)?.innerText);
    if (title) {
      break;
    }
  }
  if (!title) {
    title = cleanText(doc.title) || '未命名文章';
  }

  // Container: most specific selector first, whole body as the last resort.
  const containerSelectors = [
    'article',
    '.article-content',
    '[class*="article-content"]',
    '[class*="syl-page-article"]',
    '[class*="content"]'
  ];
  let box = null;
  for (const selector of containerSelectors) {
    box = doc.querySelector(selector);
    if (box) {
      break;
    }
  }
  if (!box) {
    box = doc.body;
  }

  // Paragraphs containing any of these phrases are dropped.
  const noisePattern = /广告|打开App|展开全文|点击进入/;
  const paragraphs = [];
  for (const node of box.querySelectorAll('p')) {
    const text = cleanText(node.innerText);
    if (text && !noisePattern.test(text)) {
      paragraphs.push(text);
    }
  }

  return { url, title, paragraphs };
}
/**
 * Saves one article as a `.doc` download containing an HTML document with
 * the title, the source URL and the body paragraphs.
 *
 * The download is started by clicking a temporary anchor that points at an
 * object URL; the object URL is revoked 1.5 seconds later.
 *
 * @param {{url: string, title: string, paragraphs: string[]}} article
 * @param {number} index - Article position, used as a zero-padded
 *   two-digit file name prefix.
 */
function saveWordTextOnly(article, index) {
  const bodyHtml = article.paragraphs
    .map((text) => `<p>${escapeHtml(text)}</p>`)
    .join('');

  // Joined with '\n'; the empty first and last entries supply the leading
  // and trailing line breaks of the document text.
  const html = [
    '',
    '<html>',
    '<head>',
    '<meta charset="utf-8">',
    `<title>${escapeHtml(article.title)}</title>`,
    '<style>',
    'body { font-family: "Microsoft YaHei", SimSun, Arial; font-size: 16px; line-height: 1.9; }',
    'h1 { font-size: 24px; text-align: center; }',
    '.url { color: #666; font-size: 12px; text-align: center; margin-bottom: 24px; }',
    'p { text-indent: 2em; margin: 10px 0; }',
    '</style>',
    '</head>',
    '<body>',
    `<h1>${escapeHtml(article.title)}</h1>`,
    `<div class="url">${escapeHtml(article.url)}</div>`,
    bodyHtml,
    '</body>',
    '</html>',
    ''
  ].join('\n');

  // A byte-order mark is written ahead of the markup.
  const blob = new Blob(['\ufeff', html], {
    type: 'application/msword;charset=utf-8'
  });
  const prefix = String(index).padStart(2, '0');
  const fileName = `${prefix}-${safeFileName(article.title)}.doc`;

  const link = document.createElement('a');
  const objectUrl = URL.createObjectURL(blob);
  link.href = objectUrl;
  link.download = fileName;
  document.body.appendChild(link);
  link.click();
  link.remove();
  // Revoke later rather than immediately after the click.
  setTimeout(() => URL.revokeObjectURL(objectUrl), 1500);

  log(`已下载:${fileName},正文 ${article.paragraphs.length} 段`, 'ok');
}
/**
 * Fetches one article, parses it and saves it as a Word file.
 *
 * Articles with no paragraphs are skipped with a warning. Any error raised
 * along the way is logged and swallowed, so the caller's loop carries on
 * with the next article.
 *
 * @param {string} url - Article address.
 * @param {number} index - 1-based position of the article in the batch.
 * @param {number} total - Number of articles in the batch.
 * @returns {Promise<void>}
 */
async function handleOneArticle(url, index, total) {
  try {
    const progress = `${index}/${total}`;
    setStatus(`采集文章 ${progress}`);
    log(`打开文章 ${progress}`);

    const res = await request(url, 'document');
    const article = parseArticle(res.response, url);
    const paragraphCount = article.paragraphs.length;

    if (paragraphCount) {
      log(`采集成功:${article.title}`, 'ok');
      log(`正文 ${paragraphCount} 段,不采集图片。`, 'ok');
      setStatus(`保存Word ${progress}`);
      saveWordTextOnly(article, index);
      // Short pause after each download is started.
      await sleep(800);
    } else {
      log(`正文为空,跳过:${article.title}`, 'warn');
    }
  } catch (err) {
    log(`第 ${index} 篇跳过:${err.message || err}`, 'err');
  }
}
/**
 * Reads a whole number from one of the panel's inputs.
 *
 * @param {string} selector - CSS selector of the input element.
 * @param {number} fallback - Value used when the input is missing, empty,
 *   not a finite number, or below `min`.
 * @param {number} min - Smallest accepted value.
 * @returns {number} The truncated input value, or `fallback`.
 */
function readNumberInput(selector, fallback, min) {
  const raw = String(document.querySelector(selector)?.value ?? '').trim();
  if (raw === '') {
    return fallback;
  }
  const value = Math.trunc(Number(raw));
  return Number.isFinite(value) && value >= min ? value : fallback;
}

/**
 * Click handler for the start button: applies the panel settings, scrolls
 * the list page, collects article links and exports the articles one by
 * one. The button is disabled while a run is in progress and is always
 * re-enabled afterwards.
 *
 * @returns {Promise<void>}
 */
async function run() {
  startBtn.disabled = true;
  try {
    logBox.innerHTML = '';
    // A scroll count of 0 is a legitimate choice (collect only what is
    // already on the page), so it must not be replaced by the default.
    // Negative or non-numeric input falls back to the default.
    config.scrollTimes = readNumberInput('#tt-scroll-times', 2, 0);
    config.maxArticles = readNumberInput('#tt-max-articles', 30, 1);
    config.requestTimeout = readNumberInput('#tt-timeout', 12000, 1);

    setStatus('滚动加载列表');
    await autoScroll();
    const links = collectLinks();
    log(`发现 ${links.length} 篇文章。`, links.length ? 'ok' : 'err');
    if (!links.length) {
      setStatus('没有找到文章');
      return;
    }
    // Sequential on purpose: each article is fetched and saved before the
    // next one is started.
    for (let i = 0; i < links.length; i++) {
      await handleOneArticle(links[i], i + 1, links.length);
    }
    setStatus('全部完成');
    log('全部处理完成。', 'ok');
  } catch (e) {
    setStatus('出错');
    log(`运行出错:${e.message || e}`, 'err');
  } finally {
    startBtn.disabled = false;
  }
}
// Build the control panel as soon as the script body runs; everything else
// is started from the panel's button.
createUI();
})();
发表评论:
◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。