414 lines
12 KiB
JavaScript
414 lines
12 KiB
JavaScript
|
{
|
||
|
"translatorID": "8efcb7cb-4180-4555-969a-08e8b34066c4",
|
||
|
"label": "Trove",
|
||
|
"creator": "Tim Sherratt",
|
||
|
"target": "^https?://trove\\.nla\\.gov\\.au/(?:newspaper|gazette|work|book|article|picture|music|map|collection)/",
|
||
|
"minVersion": "3.0",
|
||
|
"maxVersion": "",
|
||
|
"priority": 100,
|
||
|
"inRepository": true,
|
||
|
"translatorType": 4,
|
||
|
"browserSupport": "gcsibv",
|
||
|
"lastUpdated": "2020-09-15 01:24:23"
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
Trove Translator
|
||
|
Copyright (C) 2016 Tim Sherratt (tim@discontents.com.au, @wragge)
|
||
|
|
||
|
This program is free software: you can redistribute it and/or modify
|
||
|
it under the terms of the GNU Affero General Public License as published by
|
||
|
the Free Software Foundation, either version 3 of the License, or
|
||
|
(at your option) any later version.
|
||
|
|
||
|
This program is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
GNU Affero General Public License for more details.
|
||
|
|
||
|
You should have received a copy of the GNU Affero General Public License
|
||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
|
||
|
/**
 * Decide what kind of Trove page is being viewed.
 *
 * @param {Document} doc - The loaded page; only handed on to getSearchResults().
 * @param {String} url - Address of the page.
 * @returns {String|Boolean} 'multiple' for a result/newspaper-page listing that
 *     actually contains links, 'newspaperArticle' for a single newspaper
 *     article, otherwise false.
 */
function detectWeb(doc, url) {
	// Note that the url for search results has changed,
	// so the first pattern will never match.
	// However, results scraping needs to be rewritten due to the redesign,
	// so leave this as is for now.
	if (url.includes('/result?') || url.includes('/newspaper/page')) {
		return getSearchResults(doc, url, true) ? 'multiple' : false;
	}
	// Require the numeric article ID: scrape() dispatches on
	// '/newspaper/article/' and scrapeNewspaper() extracts the digits,
	// so anything looser would be detected but could not be scraped.
	if (/\/newspaper\/article\/\d+/.test(url)) {
		return "newspaperArticle";
	}
	// Scraping from works is broken due to the site redesign.
	// Prevent detection until a fix is available
	// else if (url.includes('/work/')) {
	// return "book";
	// }
	return false;
}
|
||
|
|
||
|
|
||
|
/**
 * Collect the links listed on a search-result or newspaper-page view.
 *
 * @param {Document} doc - The loaded page to query.
 * @param {String} url - Address of the page; selects which markup to look for.
 * @param {Boolean} [checkOnly] - When truthy, stop at the first usable link
 *     and return true instead of building the full list.
 * @returns {Object|Boolean} A map of link href -> trimmed link text, true in
 *     checkOnly mode when at least one usable link exists, or false when
 *     nothing usable was found.
 */
function getSearchResults(doc, url, checkOnly) {
	// Search result pages and newspaper page views use different markup.
	const resultsXPath = url.includes('/result?')
		? "//div[@id='mainresults']//li/dl/dt/a"
		: "//ol[@class='list-unstyled articles']/li/h4/a";

	const items = {};
	let found = false;
	for (const result of ZU.xpath(doc, resultsXPath)) {
		const link = result.href;
		const title = ZU.trimInternal(result.textContent);
		// Ignore anchors that lack either a target or visible text.
		if (!title || !link) continue;
		if (checkOnly) return true;
		found = true;
		items[link] = title;
	}
	return found ? items : false;
}
|
||
|
|
||
|
|
||
|
/**
 * Translator entry point: save the current page, or let the user pick
 * entries from a listing and save each of those.
 *
 * @param {Document} doc - The loaded page.
 * @param {String} url - Address of the page.
 */
function doWeb(doc, url) {
	if (detectWeb(doc, url) !== "multiple") {
		scrape(doc, url);
		return;
	}

	Zotero.selectItems(getSearchResults(doc, url), function (items) {
		// Nothing was selected.
		if (!items) return;

		// Keys are the item URLs; no document is available for them,
		// so scrape() is called with a null doc.
		for (const link of Object.keys(items)) {
			scrape(null, link);
		}
	});
}
|
||
|
|
||
|
|
||
|
/**
 * Route a single page to the matching scraper: newspaper articles go to
 * scrapeNewspaper(), everything else is treated as a work.
 *
 * @param {Document|null} doc - The loaded page, or null when called for an
 *     entry picked from a listing.
 * @param {String} url - Address of the item to save.
 */
function scrape(doc, url) {
	const handler = url.includes('/newspaper/article/') ? scrapeNewspaper : scrapeWork;
	handler(doc, url);
}
|
||
|
|
||
|
|
||
|
/**
 * Save a Trove newspaper article.
 *
 * Fetches the article's BibTeX citation, runs it through the import
 * translator, tidies the resulting item, then tries to attach a PDF
 * (via a proxy service) and the OCRd text (as a note) before completing
 * the item.
 *
 * @param {Document|null} doc - The article page, or null when called for an
 *     entry picked from a listing; abstract and tags are only read when
 *     a document is available.
 * @param {String} url - Address of the article; must contain
 *     "newspaper/article/<digits>".
 * @throws {Error} If no article ID can be found in `url`.
 */
function scrapeNewspaper(doc, url) {
	const idMatch = url.match(/newspaper\/article\/(\d+)/);
	if (!idMatch) {
		// Fail with a message that names the problem instead of a bare
		// "cannot read property of null".
		throw new Error("Trove: no newspaper article ID found in " + url);
	}
	const articleID = idMatch[1];
	const bibtexURL = "http://trove.nla.gov.au/newspaper/citations/bibtex-article-" + articleID + ".bibtex";

	ZU.HTTP.doGet(bibtexURL, function (bibtex) {
		const translator = Zotero.loadTranslator("import");
		translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
		translator.setString(bibtex);

		// Clean up the BibTeX results and add some extra stuff.
		translator.setHandler("itemDone", function (obj, item) {
			item.itemType = 'newspaperArticle';
			// The page number arrives in numPages; move it to pages.
			item.pages = item.numPages;
			delete item.numPages;
			delete item.type;
			delete item.itemID;

			// doc is null during multiple call
			if (doc) {
				item.abstractNote = ZU.xpathText(doc, "//meta[@property='og:description']/@content");
				// Add tags
				const tagNodes = ZU.xpath(doc, "//ul[contains(@class,'nlaTagContainer')]/li");
				for (const tagNode of tagNodes) {
					const tag = ZU.xpathText(tagNode, "div/a[not(contains(@class,'anno-remove'))]");
					// Skip list items that yield no tag text rather than
					// storing an empty tag.
					if (tag) item.tags.push(tag);
				}
			}

			// I've created a proxy server to generate the PDF and return the URL without locking up the browser.
			const proxyURL = "https://trove-proxy.herokuapp.com/pdf/" + articleID;
			// With the last argument 'false' passed to doGet
			// we allow all status codes to continue and reach
			// the item.complete() command.
			ZU.doGet(proxyURL, function (pdfURL) {
				if (pdfURL.startsWith('http')) {
					item.attachments.push({
						url: pdfURL,
						title: 'Trove newspaper PDF',
						mimeType: 'application/pdf'
					});
				}
				else {
					Zotero.debug("No PDF because unexpected return from trove-proxy " + proxyURL);
					Zotero.debug(pdfURL);
				}

				// Get the OCRd text and save in a note.
				const textURL = "http://trove.nla.gov.au/newspaper/rendition/nla.news-article" + articleID + ".txt";
				ZU.HTTP.doGet(textURL, function (text) {
					item.notes.push({
						note: text.trim()
					});
					item.complete();
				});
			}, null, null, null, false);
		});
		translator.translate();
	});
}
|
||
|
|
||
|
|
||
|
// Lookup table from Trove's type labels to Zotero item types.
// Keys are the "; "-separated labels of a record's type joined with
// single spaces (see checkType below).
var troveTypes = {
	Book: "book",
	"Article Article/Book chapter": "bookSection",
	Thesis: "thesis",
	"Archived website": "webpage",
	"Conference Proceedings": "book",
	"Audio book": "book",
	Article: "journalArticle",
	"Article Article/Journal or magazine article": "journalArticle",
	"Article Article/Conference paper": "conferencePaper",
	"Article Article/Report": "report",
	Photograph: "artwork",
	"Poster, chart, other": "artwork",
	"Art work": "artwork",
	Object: "artwork",
	"Microform Photograph": "artwork",
	"Microform Object": "artwork",
	Sound: "audioRecording",
	Video: "videoRecording",
	"Printed music": "book",
	Map: "map",
	Unpublished: "manuscript",
	Published: "document"
};


/**
 * Translate a Trove type string into a Zotero item type.
 *
 * The string is split on "; " and the parts are joined with spaces to form
 * a lookup key. If the full key is unknown, the last part is dropped and
 * the lookup retried, so the most specific known prefix wins.
 *
 * @param {String} string - Type value such as "Article; Article/Report".
 * @returns {String} The matching value from troveTypes, or "book" when
 *     nothing matches or no type string was supplied.
 */
function checkType(string) {
	// A record without a type field would otherwise crash on .split().
	if (typeof string !== "string") return "book";

	const types = string.split("; ");
	while (types.length > 0) {
		const candidate = types.join(" ");
		if (Object.prototype.hasOwnProperty.call(troveTypes, candidate)) {
			return troveTypes[candidate];
		}
		types.pop();
	}
	return "book";
}
|
||
|
|
||
|
|
||
|
/**
 * Tidy creator first names coming out of the BibTeX import.
 *
 * Sometimes authors are a little messy and we need to clean them
 * e.g. author = { Bayley, William A. (William Alan), 1910-1981 },
 * results in
 *   "firstName": "1910-1981, William A. (William Alan)",
 *   "lastName": "Bayley"
 *
 * Life dates are stripped. When a parenthesised fuller form follows the
 * short form, the fuller form is used if it contains the short form
 * (with its first period removed); otherwise the short form is kept.
 *
 * @param {Object[]} creators - Creator objects; modified in place.
 * @returns {Object[]} The same array that was passed in.
 */
function cleanCreators(creators) {
	for (const creator of creators) {
		// Creators without a firstName string (presumably single-field
		// names such as organisations) have nothing to clean and would
		// otherwise crash on .replace().
		if (typeof creator.firstName !== "string") continue;

		// Drop a date range such as "1910-1981," or "(1910-)".
		let name = creator.firstName.replace(/\(?\d{4}-\d{0,4}\)?,?/, "").trim();
		const posParenthesis = name.indexOf("(");
		if (posParenthesis > -1) {
			const first = name.slice(0, posParenthesis);
			// Text after "(" minus the final character, assumed to be ")".
			const second = name.slice(posParenthesis + 1, name.length - 1);
			name = second.includes(first.replace('.', '').trim()) ? second : first;
		}
		creator.firstName = name.trim();
	}
	return creators;
}
|
||
|
|
||
|
|
||
|
/**
 * Save a Trove work (or a specific version of a work).
 *
 * Fetches the BibTeX citation for the work, runs it through the import
 * translator, maps the type and cleans the creators, then adds a link to
 * the contributing repository, page metadata and a thumbnail when
 * available before completing the item.
 *
 * @param {Document|null} doc - The work page, or null when called for an
 *     entry picked from a listing; version, abstract, tags and thumbnail
 *     are only read when a document is available.
 * @param {String} url - Address of the work; query string and fragment are
 *     discarded.
 */
function scrapeWork(doc, url) {
	// Stays null unless a document is available and provides a thumbnail.
	let thumbnailURL = null;

	// Remove all params from url
	const workURL = url.replace(/[?#].*/, '');
	let bibtexURL = workURL + '?citationFormat=BibTeX';

	if (doc) {
		// Need to get version identifier for the BibTeX url
		const versionID = doc.body.innerHTML.match(/displayCiteDialog\('(.+?)'/);
		if (versionID !== null) {
			bibtexURL += '&selectedversion=' + versionID[1];
			thumbnailURL = ZU.xpathText(doc, "//a/img[@class='mosaic ui-shdw']/@src");
		}
		else {
			// It's a work -- so thumbnails are different
			thumbnailURL = ZU.xpathText(doc, "//li[@class='imgfirst']//img/@src");
		}
	}

	// Get the BibTeX and feed it to the translator.
	ZU.HTTP.doGet(bibtexURL, function (bibtex) {
		const translator = Zotero.loadTranslator("import");
		translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
		translator.setString(bibtex);
		translator.setHandler("itemDone", function (obj, item) {
			item.itemType = checkType(item.type);
			item.creators = cleanCreators(item.creators);

			// Attach a link to the contributing repository if available
			if (Object.prototype.hasOwnProperty.call(item, 'url')) {
				item.attachments.push({
					title: "Record from contributing repository",
					url: item.url,
					mimeType: 'text/html',
					snapshot: false
				});
			}

			if (doc) {
				// This gives a better version-aware url.
				// Keep the existing url when the page does not provide one.
				const pageURL = ZU.xpathText(doc, "//meta[@property='og:url']/@content");
				if (pageURL) item.url = pageURL;
				item.abstractNote = ZU.xpathText(doc, "//meta[@property='og:description']/@content");

				// Add tags
				const tagNodes = ZU.xpath(doc, "//div[@id='tagswork' or @id='content-tags']/ul/li");
				for (const tagNode of tagNodes) {
					const tag = ZU.xpathText(tagNode, "a");
					// Skip list items that yield no tag text.
					if (tag) item.tags.push(tag);
				}
			}

			// Truthiness check: the value is null both when no document
			// was available and when the page had no thumbnail.
			if (thumbnailURL) {
				item.attachments.push({
					url: thumbnailURL,
					title: 'Trove thumbnail image',
					mimeType: 'image/jpeg'
				});
			}
			item.complete();
		});
		translator.translate();
	});
}
|
||
|
|
||
|
/** BEGIN TEST CASES **/
|
||
|
var testCases = [
|
||
|
{
|
||
|
"type": "web",
|
||
|
"url": "https://trove.nla.gov.au/work/9958833?q&versionId=11567057",
|
||
|
"items": [
|
||
|
{
|
||
|
"itemType": "book",
|
||
|
"title": "Experiences of a meteorologist in South Australia",
|
||
|
"creators": [
|
||
|
{
|
||
|
"firstName": "Clement Lindley",
|
||
|
"lastName": "Wragge",
|
||
|
"creatorType": "author"
|
||
|
}
|
||
|
],
|
||
|
"date": "1980",
|
||
|
"ISBN": "9780908065073",
|
||
|
"abstractNote": "In 14 libraries. 24 p. : ill. ; 22 cm. Wragge, Clement L. (Clement Lindley), 1852-1922. South Australia. Climate, 1883-1884. Meteorologists -- South Australia -- Biography. South Australia -- Description and travel. South Australia -- Climate -- History.",
|
||
|
"itemID": "trove.nla.gov.au/work/9958833",
|
||
|
"language": "English",
|
||
|
"libraryCatalog": "Trove",
|
||
|
"publisher": "Warradale, S.Aust. : Pioneer Books",
|
||
|
"url": "https://trove.nla.gov.au/version/11567057",
|
||
|
"attachments": [],
|
||
|
"tags": [],
|
||
|
"notes": [
|
||
|
{
|
||
|
"note": "<p> Reprinted from Good words for 1887/ edited by Donald Macleod, published: London: Isbister and Co </p>"
|
||
|
}
|
||
|
],
|
||
|
"seeAlso": []
|
||
|
}
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"type": "web",
|
||
|
"url": "https://trove.nla.gov.au/newspaper/article/70068753",
|
||
|
"items": [
|
||
|
{
|
||
|
"itemType": "newspaperArticle",
|
||
|
"title": "'WRAGGE.'",
|
||
|
"creators": [],
|
||
|
"date": "7 Feb 1903",
|
||
|
"abstractNote": "We have received a copy of the above which is a journal devoted chiefly to the science of meteorology. It is owned and conducted by Mr. Clement ...",
|
||
|
"libraryCatalog": "Trove",
|
||
|
"pages": "4",
|
||
|
"place": "Vic.",
|
||
|
"publicationTitle": "Sunbury News (Vic. : 1900 - 1927)",
|
||
|
"url": "http://nla.gov.au/nla.news-article70068753",
|
||
|
"attachments": [
|
||
|
{
|
||
|
"title": "Trove newspaper PDF",
|
||
|
"mimeType": "application/pdf"
|
||
|
}
|
||
|
],
|
||
|
"tags": [
|
||
|
{
|
||
|
"tag": "Meteorology Journal - Clement Wragge"
|
||
|
}
|
||
|
],
|
||
|
"notes": [
|
||
|
{
|
||
|
"note": "<html>\n <head>\n <title>07 Feb 1903 - 'WRAGGE.'</title>\n </head>\n <body>\n <p>Sunbury News (Vic. : 1900 - 1927), Saturday 7 February 1903, page 4</p>\n <hr/>\n <div class='zone'><p>'WRAGGE' - we have received a copy of the above, which is a journal devoted chiefly to the science of meteorology. It is owned and conducted by Mr. Clement Wragge. </p></div>\n </body>\n</html>"
|
||
|
}
|
||
|
],
|
||
|
"seeAlso": []
|
||
|
}
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"type": "web",
|
||
|
"url": "http://trove.nla.gov.au/newspaper/result?l-australian=y&q=wragge",
|
||
|
"items": "multiple"
|
||
|
},
|
||
|
{
|
||
|
"type": "web",
|
||
|
"url": "http://trove.nla.gov.au/book/result?l-australian=y&q=wragge",
|
||
|
"items": "multiple"
|
||
|
},
|
||
|
{
|
||
|
"type": "web",
|
||
|
"url": "https://trove.nla.gov.au/newspaper/page/7013947",
|
||
|
"items": "multiple"
|
||
|
},
|
||
|
{
|
||
|
"type": "web",
|
||
|
"url": "https://trove.nla.gov.au/work/9531118?q&sort=holdings+desc&_=1483112824975&versionId=14744047",
|
||
|
"items": [
|
||
|
{
|
||
|
"itemType": "book",
|
||
|
"title": "Lithgow zig zag railway, Blue Mountains, New South Wales",
|
||
|
"creators": [
|
||
|
{
|
||
|
"firstName": "William Alan",
|
||
|
"lastName": "Bayley",
|
||
|
"creatorType": "author"
|
||
|
}
|
||
|
],
|
||
|
"date": "1969",
|
||
|
"abstractNote": "In 19 libraries. 40 p. : ill., map ; 22 cm. Great Zig Zag Railway (Lithgow, N.S.W.) Railroads -- Blue Mountains (N.S.W. : Mountains) Zig Zag Railway -- Lithgow, Australia. Railroads -- New South Wales. Railroads -- New South Wales -- Blue Mountains. Blue Mountains (N.S.W.)",
|
||
|
"itemID": "trove.nla.gov.au/work/9531118",
|
||
|
"language": "English",
|
||
|
"libraryCatalog": "Trove",
|
||
|
"publisher": "[Bulli, N.S.W. : Zig Zag Press",
|
||
|
"url": "https://trove.nla.gov.au/version/14744047",
|
||
|
"attachments": [],
|
||
|
"tags": [],
|
||
|
"notes": [
|
||
|
{
|
||
|
"note": "<p> Cover title </p>"
|
||
|
}
|
||
|
],
|
||
|
"seeAlso": []
|
||
|
}
|
||
|
]
|
||
|
}
|
||
|
]
|
||
|
/** END TEST CASES **/
|