ThisTest/translators/The Boston Globe.js

293 lines
8.4 KiB
JavaScript
Raw Normal View History

2022-03-23 12:58:01 +01:00
{
"translatorID": "1f245496-4c1b-406a-8641-d286b3888231",
"label": "The Boston Globe",
"creator": "Adam Crymble, Frank Bennett, Sebastian Karcher",
"target": "^https?://(www|search|articles|archive)\\.boston\\.com/",
"minVersion": "2.1.9",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2017-06-29 18:49:20"
}
/*
* Sample URLs
*
* [Original request -- uncommon page format, no embedded metadata of any kind]
* http://articles.boston.com/2011-05-03/news/29500032_1_bouncer-assault-local-restaurant
*
* [More common page formats, marginally reliable metadata in a comment block]
* http://www.boston.com/yourtown/news/charlestown/2011/04/meet_charlestowns_youth_of_the.html
* http://www.boston.com/business/articles/2011/05/05/oil_drops_below_100_per_barrel/
* http://www.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/
* Support for search results will require rewriting scrape(..) to use only regular expressions
*/
/**
 * Decide what kind of item, if any, the current page represents.
 *
 * @param {Document} doc - The page document (not inspected at present).
 * @param {String} url - The URL of the page.
 * @returns {String|Boolean} "newspaperArticle" when the URL carries a date
 *     (either a /YYYY/MM/ path segment or a YYYY-MM-DD token), otherwise false.
 */
function detectWeb(doc, url) {
	// Search disabled until cross-domain can be dealt with.
	// A plain substring test is used on purpose: passing the host name to
	// String.prototype.match() would compile it as a regular expression in
	// which every "." matches any character.
	if (url.indexOf("search.boston.com") !== -1) {
		return false;
	}
	if (/(\/[0-9]{4}\/[0-9]{2}\/|[0-9]{4}-[0-9]{2}-[0-9]{2})/.test(url)) {
		return "newspaperArticle";
	}
	return false;
}
//Boston Globe and Boston.com Translator. Original code by Adam Crymble
// Rewritten by Frank Bennett, 2011
/**
 * Look for a comment node among the direct children of an element.
 *
 * @param {Object} elem - Element to inspect; may be null/undefined/false.
 * @returns {*} The falsy `elem` itself when no element was given, the
 *     nodeValue of the first direct child whose nodeName is "#comment",
 *     or false when the element has no such child.
 */
function sniffComment (elem) {
	if (elem) {
		var kids = elem.childNodes;
		var total = kids.length;
		var idx = 0;
		while (idx < total) {
			var kid = kids[idx];
			if (kid.nodeName === "#comment") {
				// First comment wins; later ones are never looked at.
				return kid.nodeValue;
			}
			idx += 1;
		}
		return false;
	}
	// Nothing to inspect: hand the falsy argument straight back.
	return elem;
}
/**
 * Locate the comment block that carries the page's embedded metadata.
 *
 * Elements with class "hideMe" are tried first, in document order; the
 * element with id "content" is the fallback.
 *
 * @param {Document} doc - The page document.
 * @returns {*} The first truthy result of sniffComment() over the "hideMe"
 *     elements, otherwise whatever sniffComment() returns for #content.
 */
function findMagicComment (doc) {
	var hidden = doc.getElementsByClassName("hideMe");
	var count = hidden.length;
	for (var n = 0; n < count; n += 1) {
		var found = sniffComment(hidden.item(n));
		if (found) {
			return found;
		}
	}
	return sniffComment(doc.getElementById("content"));
}
/**
 * Read the raw byline text from the page.
 *
 * The first element with class "byline" is preferred; the element with id
 * "byline" is the fallback. Newlines are turned into spaces and a leading
 * "By " / "Posted by " is stripped.
 *
 * Side effect: when the text contains "Posted by ", newItem.itemType is set
 * to "blogPost".
 *
 * @param {Document} doc - The page document.
 * @param {Object} newItem - Item under construction; may be mutated.
 * @returns {String} The byline text, or "" when no byline element exists.
 */
function findAuthorString (doc, newItem) {
	var classMatches = doc.getElementsByClassName("byline");
	var bylineNode = classMatches.length ? classMatches.item(0) : false;
	if (!bylineNode) {
		bylineNode = doc.getElementById('byline');
	}
	if (!bylineNode) {
		return "";
	}
	var text = bylineNode.textContent.replace(/\n/g, " ");
	if (/[Pp]osted\s+by\s+/.test(text)) {
		newItem.itemType = "blogPost";
	}
	return text.replace(/^\s*(?:[Bb]y|[Pp]osted\s+by)\s+(.*)/, "$1");
}
/**
 * Create an item pre-filled with the fields shared by every page layout:
 * publication title, URL and an HTML snapshot attachment.
 *
 * @param {Document} doc - The page document.
 * @returns {Zotero.Item} A new "newspaperArticle" item.
 */
function newBostonItem (doc) {
	var item = new Zotero.Item("newspaperArticle");
	item.publicationTitle = "Boston.com";
	item.url = doc.location.href;
	item.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});
	return item;
}

/**
 * Build and save an item for a page that carries a metadata comment.
 *
 * @param {Document} doc - The page document.
 * @param {String} magicComment - Text of the metadata comment.
 */
function scrapeMagicCommentPage (doc, magicComment) {
	var newItem = newBostonItem(doc);
	// Flatten the comment so the single-line tag patterns below can match.
	// (A regex with the g flag is required: the three-argument form
	// replace('\n', '', 'g') is non-standard and only strips the first
	// newline in engines that ignore the flags argument.)
	var info = magicComment.replace(/\n/g, '');
	newItem.title = ZU.xpathText(doc, '//div[@id="headTools"]/h1');
	newItem.date = ZU.xpathText(doc, '//span[@id="dateline"]/text()[2]');
	// Prefer the visible byline; the comment's byline is a last resort.
	var authors = findAuthorString(doc, newItem);
	if (!authors) {
		// Use match() rather than replace() so that a missing tag yields ""
		// instead of the whole comment being mistaken for an author string.
		var bylineMatch = info.match(/.*<byline>(.*)<\/byline>/);
		authors = bylineMatch ? bylineMatch[1] : "";
		if (authors.toLowerCase() === authors) {
			// A byline without any capital letter (or none at all) is not
			// usable; fall back to the first word of the tease text.
			var teaseMatch = info.match(/.*<teasetext>(.*)<\/teasetext>/);
			var tease = teaseMatch ? teaseMatch[1] : "";
			var firstWord = tease.match(/^(?:[Bb]y\s+)*([^ ,]+).*/);
			authors = firstWord ? firstWord[1] : "";
		}
	}
	// "A, B and C, Globe Staff" -> ["A", "B", "C"]: split on "and", drop
	// whatever follows the first comma of the last chunk, then split the
	// remaining comma-separated names.
	var chunks = authors.split(/,*\s+and\s+/);
	chunks[chunks.length - 1] = chunks[chunks.length - 1].split(/,\s+/)[0];
	var names = chunks.join(", ").split(/,\s+/);
	for (var j = 0, jlen = names.length; j < jlen; j += 1) {
		var author = ZU.cleanAuthor(names[j], 'author');
		if (author.lastName) {
			newItem.creators.push(author);
		}
	}
	newItem.complete();
}

/**
 * Build and save an item for a page that has a "mod-article-byline" block.
 *
 * @param {Document} doc - The page document.
 * @param {Element} infoElem - The element with id "mod-article-byline".
 */
function scrapeInfoBlockPage (doc, infoElem) {
	var newItem = newBostonItem(doc);
	// Date: getElementsByClassName returns a collection, so the text has to
	// be read from its first member, not from the collection itself.
	var dateElems = infoElem.getElementsByClassName('pubdate');
	if (dateElems.length) {
		newItem.date = dateElems[0].textContent;
	}
	// Authors: the lookup result may be empty/null when the expected spans
	// are missing, so guard before treating it as a string.
	var bylineText = ZU.xpathText(infoElem, './span[@class="separator"]/following-sibling::span');
	if (bylineText) {
		// Strip the leading "By" and everything from the first comma on.
		bylineText = bylineText.replace(/^\s*[Bb]y|,.+?$/g, "").trim();
		var names = bylineText.split(/ and |\s*,\s*/);
		for (var i = 0, ilen = names.length; i < ilen; i += 1) {
			var author = ZU.cleanAuthor(names[i], "author");
			// Same filter as the comment-based path: skip empty creators.
			if (author.lastName) {
				newItem.creators.push(author);
			}
		}
	}
	// Title
	var headerElem = doc.getElementById('mod-article-header');
	if (headerElem) {
		var headings = headerElem.getElementsByTagName('h1');
		if (headings.length) {
			newItem.title = headings[0].textContent;
		}
	}
	newItem.complete();
}

/**
 * Scrape a single Boston.com article page.
 *
 * The site content is pretty chaotic, we do our best. There are two
 * independent set-and-save paths, one per known page layout; each one that
 * recognises the page completes its own item.
 *
 * Many pages seem to have metadata embedded in a comment. The date and
 * headline info look reliable, but the byline in that comment is a disaster,
 * to be used only if absolutely necessary.
 *
 * @param {Document} doc - The page document.
 * @param {String} url - The page URL (unused; doc.location.href is read instead).
 */
function scrape (doc, url) {
	var magicComment = findMagicComment(doc);
	if (magicComment) {
		scrapeMagicCommentPage(doc, magicComment);
	}
	// Information block
	var infoElem = doc.getElementById("mod-article-byline");
	if (infoElem) {
		scrapeInfoBlockPage(doc, infoElem);
	}
}
/**
 * Translator entry point.
 *
 * On a page detected as "multiple", collect the result links, let the user
 * pick among them and scrape every chosen URL. On any other page, scrape the
 * current document directly.
 *
 * @param {Document} doc - The page document.
 * @param {String} url - The page URL.
 */
function doWeb (doc, url) {
	if (detectWeb(doc, url) != "multiple") {
		scrape(doc, url);
		return;
	}

	// Map of link URL -> link text, as offered to the user for selection.
	var candidates = {};
	var links = doc.evaluate('//div[@class="regTZ"]/a[@class="titleLink"]', doc, null, XPathResult.ANY_TYPE, null);
	for (var link = links.iterateNext(); link; link = links.iterateNext()) {
		candidates[link.href] = link.textContent;
	}

	var chosenUrls = [];
	Zotero.selectItems(candidates, function (selected) {
		if (!selected) {
			return true;
		}
		for (var href in selected) {
			chosenUrls.push(href);
		}
		ZU.processDocuments(chosenUrls, scrape);
	});
}
/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "http://archive.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/?camp=pm",
"items": [
{
"itemType": "newspaperArticle",
"title": "Anticipation grows for MFAs spring flower festival",
"creators": [
{
"firstName": "Carol",
"lastName": "Stocker",
"creatorType": "author"
}
],
"date": "April 28, 2011",
"libraryCatalog": "The Boston Globe",
"publicationTitle": "Boston.com",
"url": "http://archive.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/?camp=pm",
"attachments": [
{
"mimetype": "text/html",
"snapshot": true,
"title": "Boston.com page"
}
],
"tags": [],
"notes": [],
"seeAlso": []
}
]
},
{
"type": "web",
"url": "http://archive.boston.com/news/nation/washington/articles/2011/05/08/bin_laden_occupied_shrunken_dark_world/",
"items": [
{
"itemType": "newspaperArticle",
"title": "A peek inside bin Ladens world: isolation, vanity, power",
"creators": [
{
"firstName": "Elisabeth",
"lastName": "Bumiller",
"creatorType": "author"
},
{
"firstName": "Carlotta",
"lastName": "Gall",
"creatorType": "author"
}
],
"date": "May 8, 2011",
"libraryCatalog": "The Boston Globe",
"publicationTitle": "Boston.com",
"shortTitle": "A peek inside bin Ladens world",
"url": "http://archive.boston.com/news/nation/washington/articles/2011/05/08/bin_laden_occupied_shrunken_dark_world/",
"attachments": [
{
"mimetype": "text/html",
"snapshot": true,
"title": "Boston.com page"
}
],
"tags": [],
"notes": [],
"seeAlso": []
}
]
}
]
/** END TEST CASES **/