ThisTest/translators/Google Scholar.js

968 lines
27 KiB
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"translatorID": "57a00950-f0d1-4b41-b6ba-44ff0fc30289",
"label": "Google Scholar",
"creator": "Simon Kornblith, Frank Bennett, Aurimas Vinckevicius",
"target": "^https?://scholar[-.]google[-.](com|cat|(com?[-.])?[a-z]{2})(\\.[^/]+)?/(scholar(_case)?\\?|citations\\?)",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2018-11-16 05:56:18"
// attr()/text() v2
/**
 * Look up an element under `docOrElem` and return one of its attributes.
 * A truthy `index` picks the index-th match of `selector`; otherwise the
 * first match is used. Returns null when no element is found.
 */
function attr(docOrElem, selector, attr, index) {
	var match;
	if (index) {
		match = docOrElem.querySelectorAll(selector).item(index);
	} else {
		match = docOrElem.querySelector(selector);
	}
	if (!match) {
		return null;
	}
	return match.getAttribute(attr);
}

/**
 * Look up an element under `docOrElem` and return its textContent.
 * Element selection follows the same rules as attr(); returns null when
 * no element is found.
 */
function text(docOrElem, selector, index) {
	var match;
	if (index) {
		match = docOrElem.querySelectorAll(selector).item(index);
	} else {
		match = docOrElem.querySelector(selector);
	}
	if (!match) {
		return null;
	}
	return match.textContent;
}
/**
 * Decide what kind of item(s) the current Google Scholar page offers.
 *
 * @param {Document} doc - the loaded page
 * @param {string} url - the page URL
 * @returns {string|undefined} "case", "journalArticle", "multiple", or
 *   undefined when nothing on the page is recognised
 */
function detectWeb(doc, url) {
	// Law cases live under /scholar_case?, but URLs carrying an "about="
	// parameter ("How cited" pages) must not be treated as the case itself.
	if (url.indexOf('/scholar_case?') != -1
		&& url.indexOf('about=') == -1
	) {
		return "case";
	} else if (url.indexOf('/citations?') != -1) {
		if (getProfileResults(doc, true)) {
			return "multiple";
		}

		//individual saved citation
		var link = ZU.xpathText(doc, '//a[@class="gsc_vcd_title_link"]/@href');
		if (!link) return;
		if (link.indexOf('/scholar_case?') != -1) {
			return 'case';
		} else {
			//Can't distinguish book from journalArticle
			//Both have "Journal" fields
			return 'journalArticle';
		}
	} else if (getSearchResults(doc, true)) {
		return "multiple";
	}
}
/**
 * Collect the search results listed on a Google Scholar results page.
 *
 * @param {Document} doc - page to scan for `.gs_r[data-cid]` rows
 * @param {boolean} checkOnly - when truthy, return true as soon as one
 *   usable row is found instead of building the full map
 * @returns {Object|boolean} map of row `data-cid` value -> title text,
 *   true in check-only mode, or false when no usable row exists
 */
function getSearchResults(doc, checkOnly) {
	var items = {};
	var found = false;
	var rows = doc.querySelectorAll('.gs_r[data-cid]');
	for (var i = 0; i < rows.length; i++) {
		var href = rows[i].dataset.cid;
		var title = text(rows[i], '.gs_rt');
		// Rows without an id or a title cannot be selected or scraped
		if (!href || !title) continue;
		if (checkOnly) return true;
		found = true;
		items[href] = title;
	}
	return found ? items : false;
}
/**
 * Collect the publications listed on a Google Scholar profile page.
 *
 * @param {Document} doc - page to scan for `a.gsc_a_at` links
 * @param {boolean} checkOnly - when truthy, return true as soon as one
 *   usable link is found instead of building the full map
 * @returns {Object|boolean} map of link `data-href` value -> link text,
 *   true in check-only mode, or false when no usable link exists
 */
function getProfileResults(doc, checkOnly) {
	var items = {};
	var found = false;
	var rows = doc.querySelectorAll('a.gsc_a_at');
	for (var i = 0; i < rows.length; i++) {
		var href = rows[i].dataset.href;
		var title = rows[i].textContent;
		// Links without a target or visible text are skipped
		if (!href || !title) continue;
		if (checkOnly) return true;
		found = true;
		items[href] = title;
	}
	return found ? items : false;
}
/**
 * Translator entry point: save the item(s) offered by the current page.
 *
 * For "multiple" pages the user is asked to pick entries. Search-result
 * picks are scraped directly from this document by id; profile picks are
 * URLs that are loaded first and then handed to scrape(). Any other
 * detected type is scraped from the current document.
 *
 * @param {Document} doc - the loaded page
 * @param {string} url - the page URL
 */
function doWeb(doc, url) {
	var type = detectWeb(doc, url);
	if (type == "multiple") {
		if (getSearchResults(doc, true)) {
			Zotero.selectItems(getSearchResults(doc, false), function (items) {
				if (!items) {
					return true;
				}
				var ids = [];
				for (var i in items) {
					ids.push(i);
				}
				//here it is enough to know the ids and we can call scrape directly
				scrape(doc, ids);
			});
		} else if (getProfileResults(doc, true)) {
			Zotero.selectItems(getProfileResults(doc, false), function (items) {
				if (!items) {
					return true;
				}
				var articles = [];
				for (var i in items) {
					articles.push(i);
				}
				//here we need open these pages before calling scrape
				ZU.processDocuments(articles, scrape);
			});
		}
	} else {
		scrape(doc, url, type);
	}
}
/**
 * Dispatch scraping for either a list of result ids or a single page.
 *
 * @param {Document} doc - the page being scraped
 * @param {string[]|string} idsOrUrl - an array of result ids, or the URL
 *   of a single item page
 * @param {string} [type] - detected item type; "case" routes to scrapeCase
 * @throws {Error} when a single non-case page has no "related" link, or
 *   when no item id can be extracted from that link
 */
function scrape(doc, idsOrUrl, type) {
	if (Array.isArray(idsOrUrl)) {
		scrapeIds(doc, idsOrUrl);
	} else {
		if (type && type=="case") {
			scrapeCase(doc, idsOrUrl);
		} else {
			// The item id is recovered from the page's "related articles" link
			var related = ZU.xpathText(doc, '//a[contains(@href, "q=related:")]/@href');
			if (!related) {
				throw new Error("Could not locate related URL");
			}
			var itemID = related.match(/=related:([^:]+):/);
			if (itemID) {
				scrapeIds(doc, [itemID[1]]);
			} else {
				Z.debug("Can't find itemID. related URL is " + related);
				throw new Error("Cannot extract itemID from related link");
			}
		}
	}
}
/**
 * Fetch and import BibTeX for each Google Scholar result id.
 * For every id this requests a citation page, locates the BibTeX link in
 * the response, downloads the BibTeX, and post-processes each imported item
 * in an "itemDone" handler (case and patent detection, title and creator
 * cleanup, attachments).
 *
 * NOTE(review): this copy of the function is incomplete — closing braces and
 * several statements/operands are missing (see the inline notes). Restore it
 * from the upstream translator before relying on it.
 *
 * @param {Document} doc - document searched for the matching result rows
 * @param {string[]} ids - ids used in the `.gs_r[data-cid="…"]` selector and
 *   in the citation URLs
 */
function scrapeIds(doc, ids) {
for (let i=0; i<ids.length; i++) {
// We need here 'let' to access ids[i] later in the nested functions
let context = doc.querySelector('.gs_r[data-cid="' + ids[i] + '"]');
// With a single id and no matching row, fall back to the whole document
if (!context && ids.length==1) context = doc;
var citeUrl = '/scholar?q=info:' + ids[i] + '';
// For 'My Library' we check the search field at the top
// and then in these cases change the citeUrl accordingly.
var scilib = attr(doc, '#gs_hdr_frm input[name="scilib"]', 'value')
if (scilib && scilib==1) {
var citeUrl = '/scholar?scila=' + ids[i] + '&output=cite&scirp=1';
ZU.doGet(citeUrl, function(citePage) {
var m = citePage.match(/href="((https?:\/\/[a-z\.]*)?\/scholar.bib\?[^"]+)/);
if (!m) {
//Saved lists and possibly other places have different formats for BibTeX URLs
//Trying to catch them here (can't add test bc lists are tied to google accounts)
m = citePage.match(/href="(.+?)">BibTeX<\/a>/);
if (!m) {
var msg = "Could not find BibTeX URL";
var title = citePage.match(/<title>(.*?)<\/title>/i);
// NOTE(review): the two consecutive `if (title)` tests below are redundant.
if (title) {
if (title) msg += ' Got page with title "' + title[1] +'"';
throw new Error(msg);
var bibUrl = ZU.unescapeHTML(m[1]);
ZU.doGet(bibUrl, function(bibtex) {
var translator = Zotero.loadTranslator("import");
translator.setHandler("itemDone", function(obj, item) {
//these two variables are extracted from the context
var titleLink = attr(context, 'h3 a, #gsc_vcd_title a', 'href');
var secondLine = text(context, '.gs_a') || '';
//case are not recognized and can be characterized by the
//titleLink, or that the second line starts with a number
//e.g. 1 Cr. 137 - Supreme Court, 1803
if ((titleLink && titleLink.indexOf('/scholar_case?')>-1) ||
secondLine && ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'].indexOf(secondLine[0])>-1) {
item.itemType = "case";
item.caseName = item.title;
item.reporter = item.publicationTitle;
item.reporterVolume = item.volume;
// NOTE(review): the right-hand side of the next assignment is missing in this copy.
item.dateDecided =;
item.court = item.publisher;
//patents are not recognized but are easily detected
//by the titleLink or second line
// NOTE(review): the substring searched for in titleLink is empty here, so
// indexOf('') is 0 for any titleLink — the original pattern appears lost.
if ((titleLink && titleLink.indexOf('')>-1) || secondLine.indexOf('Google Patents')>-1) {
item.itemType = "patent";
//authors are inventors
for (var i=0, n=item.creators.length; i<n; i++) {
item.creators[i].creatorType = 'inventor';
//country and patent number
if (titleLink) {
let m = titleLink.match(/\/patents\/([A-Za-z]+)(.*)$/);
// NOTE(review): the assignment target before "= m[1]" is missing in this copy.
if (m) { = m[1];
item.patentNumber = m[2];
//fix titles in all upper case, e.g. some patents in search results
if (item.title.toUpperCase() == item.title) {
item.title = ZU.capitalizeTitle(item.title);
//delete "others" as author
if (item.creators.length) {
var lastCreatorIndex = item.creators.length-1,
lastCreator = item.creators[lastCreatorIndex];
if (lastCreator.lastName === "others" && (lastCreator.fieldMode === 1 ||lastCreator.firstName === "")) {
item.creators.splice(lastCreatorIndex, 1);
//clean author names
for (var j=0, m=item.creators.length; j<m; j++) {
if (!item.creators[j].firstName) continue;
// NOTE(review): the remaining arguments of this cleanAuthor call are missing in this copy.
item.creators[j] = ZU.cleanAuthor(
item.creators[j].lastName + ', ' +
//attach linked document as attachment if available
var documentLinkTarget = attr(context, '.gs_or_ggsm a, #gsc_vcd_title_gg a', 'href');
var documentLinkTitle = text(context, '.gs_or_ggsm a, #gsc_vcd_title_gg a');
if (documentLinkTarget) {
// NOTE(review): no declaration of `attachment` is visible in this block.
attachment = {
title: "Full Text",
url: documentLinkTarget
// A leading "[PDF]"-style label in the link text selects the MIME type
var m = documentLinkTitle.match(/^\[(\w+)\]/);
if (m) {
var mimeTypes = {
'PDF': 'application/pdf',
'DOC': 'application/msword',
'HTML': 'text/html'
// NOTE(review): the membership test upper-cases m[1] but the lookup below
// does not, so a lower-case label would yield an undefined mimeType.
if (Object.keys(mimeTypes).indexOf(m[1].toUpperCase())>-1) {
attachment.mimeType = mimeTypes[m[1]];
// Attach linked page as snapshot if available
if (titleLink && titleLink != documentLinkTarget) {
url: titleLink,
title: "Snapshot",
mimeType: "text/html"
* #########################
* ### Scraper Functions ###
* #########################
// Counter used to hand out placeholder item IDs; read and incremented in
// ItemFactory.prototype.saveItem.
var bogusItemID = 1;
/**
 * Scrape a single law-case page using the citelet found in the element with
 * id "gsl_reference" or "gs_reference".
 *
 * NOTE(review): this copy of the function is cut off after the reporter
 * check; closing braces and the remaining steps are missing.
 *
 * @param {Document} doc - the case page
 * @param {string} url - the page URL, used as the default attachment pointer
 */
var scrapeCase = function (doc, url) {
// Citelet is identified by
// id="gsl_reference"
var refFrag = doc.evaluate('//div[@id="gsl_reference"] | //div[@id="gs_reference"]',
doc, null, XPathResult.ANY_TYPE, null).iterateNext();
if (refFrag) {
// citelet looks kind of like this
// Powell v. McCormack, 395 US 486 - Supreme Court 1969
var item = new Zotero.Item("case");
var attachmentPointer = url;
// When Zotero.isMLZ is set, attach the opinion wrapper element instead of the URL
if (Zotero.isMLZ) {
var block = doc.getElementById("gs_opinion_wrapper");
if (block) {
attachmentPointer = block;
var factory = new ItemFactory(doc, refFrag.textContent, [attachmentPointer]);
if (!factory.hasReporter()) {
// Look for docket number in the current document
* ####################
* ### Item Factory ###
* ####################
/**
 * Holds the working state for turning one citelet string into case item(s).
 *
 * NOTE(review): this copy of the constructor is damaged — an assignment
 * target is missing on the `this.v.extra` line, the block comment opened at
 * "handled outside of item factory" is never closed, and the closing brace
 * is absent.
 *
 * @param {Document} doc - page the citelet came from
 * @param {string} citeletString - raw citelet text to parse
 * @param {Array} attachmentLinks - URL strings or DOM fragments to attach
 * @param {string} [titleString] - initial title, stored as this.v.title
 */
var ItemFactory = function (doc, citeletString, attachmentLinks, titleString /*, bibtexLink*/) {
// var strings
this.v = {};
this.v.title = titleString;
this.v.number = false;
this.v.court = false;
this.v.extra = false; = undefined;
this.v.jurisdiction = false;
this.v.docketNumber = false;
// One entry per parsed volume/reporter/page triple
this.vv = {};
this.vv.volRepPag = [];
// portable array
this.attachmentLinks = attachmentLinks;
this.doc = doc;
// working strings
this.citelet = citeletString;
/** handled outside of item factory
this.bibtexLink = bibtexLink;
this.bibtexData = undefined;
this.trailingInfo = false;
// simple arrays of strings
this.hyphenSplit = false;
this.commaSplit = false;
/**
 * Normalise a citelet that lacks the " - " separator.
 * When no whitespace-hyphen-whitespace sequence is present, the first
 * ", Xxxx:" (comma, then a capitalised word followed by a colon) is
 * rewritten to " - Xxxx:" so later splitting on " - " works.
 */
ItemFactory.prototype.repairCitelet = function () {
	if (!this.citelet.match(/\s+-\s+/)) {
		this.citelet = this.citelet.replace(/,\s+([A-Z][a-z]+:)/, " - $1");
	}
};
/**
 * Re-capitalise this.v.title when it contains an all-caps word of four or
 * more letters, then lower-case a standalone "V" (the "v" in "A v. B").
 */
ItemFactory.prototype.repairTitle = function () {
	// All-caps words of four or more characters probably need fixing.
	if (this.v.title.match(/(?:[^a-z]|^)[A-Z]{4,}(?:[^a-z]|$)/)) {
		this.v.title = ZU.capitalizeTitle(this.v.title.toLowerCase(), true)
			.replace(/([^0-9a-z])V([^0-9a-z])/, "$1v$2");
	}
};
/**
 * Report whether the citelet yielded anything worth saving.
 * @returns {boolean} true when getDate() returns a truthy value or
 *   hasInitials() is true; false otherwise
 */
ItemFactory.prototype.hasUsefulData = function () {
	if (this.getDate()) {
		return true;
	}
	if (this.hasInitials()) {
		return true;
	}
	return false;
};
/**
 * Report whether the first hyphen-separated citelet segment contains a
 * capital letter followed by a space.
 * @returns {boolean} false when hyphenSplit is unset or empty
 */
ItemFactory.prototype.hasInitials = function () {
	if (this.hyphenSplit.length && this.hyphenSplit[0].match(/[A-Z] /)) {
		return true;
	}
	return false;
};
/**
 * Report whether at least one volume/reporter/page entry has been parsed.
 * @returns {boolean}
 */
ItemFactory.prototype.hasReporter = function () {
	if (this.vv.volRepPag.length > 0) {
		return true;
	}
	return false;
};
/**
 * Citelet parsing, step (1): split the citelet into hyphen-separated parts,
 * look for a trailing four-digit year, and then try to find a more specific
 * date in the opinion's centered text.
 *
 * NOTE(review): this copy is damaged — the targets of several assignments and
 * both operands of one condition are missing (lines reading `{ = m[2];`,
 * `if (! && !== false) { = false;`, and `// ...then use it = specificDate;`),
 * and closing braces and any return statement are absent. Presumably the
 * missing name is a date field on this.v — confirm against upstream.
 */
ItemFactory.prototype.getDate = function () {
var i, m;
// Citelet parsing, step (1)
if (!this.hyphenSplit) {
if (this.citelet.match(/\s+-\s+/)) {
this.hyphenSplit = this.citelet.split(/\s+-\s+/);
} else {
// No " - " separator: try to peel a trailing "... Court, <something>" segment
m = this.citelet.match(/^(.*),\s+([^,]+Court,\s+[^,]+)$/);
if (m) {
this.hyphenSplit = [m[1], m[2]];
} else {
this.hyphenSplit = [this.citelet];
this.trailingInfo = this.hyphenSplit.slice(-1);
if (! && !== false) { = false;
// Scan segments from last to first for one ending in a four-digit year
for (i = this.hyphenSplit.length - 1; i > -1; i += -1) {
m = this.hyphenSplit[i].match(/(?:(.*)\s+)*([0-9]{4})$/);
if (m) { = m[2];
if (m[1]) {
this.hyphenSplit[i] = m[1];
} else {
this.hyphenSplit[i] = "";
this.hyphenSplit = this.hyphenSplit.slice(0, i + 1);
// If we can find a more specific date in the case's centered text then use it
var nodesSnapshot = this.doc.evaluate('//div[@id="gs_opinion"]/center', this.doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null );
for ( var iNode = 0; iNode < nodesSnapshot.snapshotLength; iNode++ ) {
var specificDate = nodesSnapshot.snapshotItem(iNode).textContent.trim();
// Remove the first word through the first space
// if it starts with "Deci" or it doesn't start with the first three letters of a month
// and if it doesn't start with Submitted or Argued
// (So, words like "Decided", "Dated", and "Released" will be removed)
specificDate = specificDate.replace(/^(?:Deci|(?!Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Submitted|Argued))[a-z]+[.:]?\s*/i,"")
// Remove the trailing period, if it is there
// If the remaining text is a valid date...
if (!isNaN(Date.parse(specificDate))) {
// ...then use it = specificDate;
/**
 * Citelet parsing, step (2): pop the last hyphen-separated segment and
 * derive the court and jurisdiction from it.
 *
 * Two shapes are recognised:
 *   "<... Court>, <jurisdiction>"   e.g. "Dist. Court, SD Ohio"
 *   "<jurisdiction>: <court>"       e.g. "Ohio: Supreme Court"
 * The court is stored in this.v.court; the jurisdiction, when present, is
 * stored in this.v.extra as "{:jurisdiction: <value>}".
 */
ItemFactory.prototype.getCourt = function () {
	var s, m;
	// Citelet parsing, step (2)
	// Drop a trailing comma; a trailing ellipsis is replaced with "Court"
	s = this.hyphenSplit.pop().replace(/,\s*$/, "").replace(/\u2026\s*$/, "Court");
	var court = null;
	var jurisdiction = null;
	m = s.match(/(.* Court),\s+(.*)/);
	if (m) {
		court = m[1];
		jurisdiction = m[2];
	}
	if (!court) {
		m = s.match(/(?:([a-zA-Z]+):\s*)*(.*)/);
		if (m) {
			court = m[2].replace(/_/g, " ");
			jurisdiction = m[1];
		}
	}
	if (court) {
		this.v.court = court;
	}
	if (jurisdiction) {
		this.v.extra = "{:jurisdiction: " + jurisdiction + "}";
	}
};
/**
 * Citelet parsing, step (3): split the last remaining hyphen segment on
 * commas and, scanning from the end, match parts of the form
 * "<digits> <reporter> <pages>" into volume/reporter/pages objects.
 *
 * NOTE(review): this copy is damaged — the bodies of the inner `if` and of
 * the `else` branch, whatever stores `volRepPag`, and all closing braces are
 * missing, so the handling of matched and unmatched parts cannot be
 * determined from this text.
 */
ItemFactory.prototype.getVolRepPag = function () {
var i, m;
// Citelet parsing, step (3)
if (this.hyphenSplit.length) {
this.commaSplit = this.hyphenSplit.slice(-1)[0].split(/\s*,\s+/);
var gotOne = false;
for (i = this.commaSplit.length - 1; i > -1; i += -1) {
// Leading digits are the volume, the last whitespace-free token the pages,
// and everything between them the reporter
m = this.commaSplit[i].match(/^([0-9]+)\s+(.*)\s+(.*)/);
if (m) {
var volRepPag = {};
volRepPag.volume = m[1];
volRepPag.reporter = m[2];
volRepPag.pages = m[3].replace(/\s*$/, "");
if (!volRepPag.pages.match(/[0-9]$/) && (i > 0 || gotOne)) {
gotOne = true;
} else {
/**
 * Citelet parsing, step (4) [optional]: rebuild the title from the
 * comma-separated parts in this.commaSplit. Does nothing while commaSplit
 * is still unset (false).
 */
ItemFactory.prototype.getTitle = function () {
	// Citelet parsing, step (4) [optional]
	if (this.commaSplit) {
		this.v.title = this.commaSplit.join(", ");
	}
};
/**
 * Look up a docket number in `doc` and store it in this.v.docketNumber after
 * stripping a leading "No"-style prefix and a trailing period.
 *
 * NOTE(review): the beginning of the XPath expression passed to
 * doc.evaluate is missing in this copy (only the part after the first "|"
 * survives), and closing braces are absent.
 *
 * @param {Document} doc - document to search
 */
ItemFactory.prototype.getDocketNumber = function (doc) {
var docNumFrag = doc.evaluate(
| //div[@class="gsc_value" and preceding-sibling::div[text()="Docket id"]]',
doc, null, XPathResult.ANY_TYPE, null).iterateNext();
if (docNumFrag) {
this.v.docketNumber = docNumFrag.textContent
.replace(/^\s*[Nn][Oo](?:.|\s+)\s*/, "")
.replace(/\.\s*$/, "");
/**
 * Build the attachment list for the current item from this.attachmentLinks.
 * String entries are treated as URLs; other entries are treated as DOM
 * fragments and composed into a standalone document via
 * Zotero.Utilities.composeDoc.
 *
 * NOTE(review): this copy is damaged — the statements that wrap the
 * `title: attachmentTitle,` properties and add entries to `attachments`,
 * along with closing braces, are missing.
 *
 * @param {string} doctype - label appended to "Google Scholar " for the title
 * @returns {Array} the attachments array
 */
ItemFactory.prototype.getAttachments = function (doctype) {
var i, ilen, attachments;
var attachmentTitle = "Google Scholar " + doctype;
attachments = [];
for (i = 0, ilen = this.attachmentLinks.length; i < ilen; i += 1) {
if (!this.attachmentLinks[i]) continue;
if ("string" === typeof this.attachmentLinks[i]) {
title: attachmentTitle,
} else {
// DOM fragment and parent doc
var block = this.attachmentLinks[i];
var doc = block.ownerDocument;
// String content (title, url, css)
var title = doc.getElementsByTagName("title")[0].textContent;
var url = doc.documentURI;
var css = "*{margin:0;padding:0;}div.mlz-outer{width: 60em;margin:0 auto;text-align:left;}body{text-align:center;}p{margin-top:0.75em;margin-bottom:0.75em;}div.mlz-link-button a{text-decoration:none;background:#cccccc;color:white;border-radius:1em;font-family:sans;padding:0.2em 0.8em 0.2em 0.8em;}div.mlz-link-button a:hover{background:#bbbbbb;}div.mlz-link-button{margin: 0.7em 0 0.8em 0;}";
// head element
// NOTE(review): the page title is concatenated into innerHTML unescaped.
var head = doc.createElement("head");
head.innerHTML = '<title>' + title + '</title>';
head.innerHTML += '<style type="text/css">' + css + '</style>';
var attachmentdoc = Zotero.Utilities.composeDoc(doc, head, block);
title: attachmentTitle,
// URL for this item
this.item.url = url;
return attachments;
/**
 * Replace the current item's attachments with those built by
 * getAttachments(doctype).
 * @param {string} doctype - passed through to getAttachments
 */
ItemFactory.prototype.pushAttachments = function (doctype) {
	this.item.attachments = this.getAttachments(doctype);
};
/**
 * Fetch BibTeX from this.bibtexLink unless this.bibtexData is already set
 * or has been marked false. A response whose title field is empty
 * ("title={{}}") is recorded as false.
 *
 * NOTE(review): this copy is cut off — closing braces and any use of
 * `callback` are missing. Also, inside the plain-function callback `this`
 * is not lexically bound to the factory; whether these assignments reach
 * the factory depends on how doGet invokes the callback — confirm.
 *
 * @param {Function} callback - not referenced in the visible part of the body
 */
ItemFactory.prototype.getBibtexData = function (callback) {
if (!this.bibtexData) {
if (this.bibtexData !== false) {
Zotero.Utilities.doGet(this.bibtexLink, function(bibtexData) {
if (!bibtexData.match(/title={{}}/)) {
this.bibtexData = bibtexData;
} else {
this.bibtexData = false;
/**
 * Create case item(s) from the parsed data: one Zotero "case" item per
 * volume/reporter/page entry when any exist, otherwise a single item.
 *
 * NOTE(review): this copy is heavily damaged — the statements that fill
 * `completed_items`, the bodies of the nested i/j loops, and all closing
 * braces are missing. `j` and `jlen` are used without a visible
 * declaration.
 *
 * @throws {Error} when no title is available, or when reporter entries
 *   exist but no item was completed
 */
ItemFactory.prototype.saveItem = function () {
var i, ilen, key;
if (this.v.title) {
if (this.vv.volRepPag.length) {
var completed_items = [];
for (i = 0, ilen = this.vv.volRepPag.length; i < ilen; i += 1) {
this.item = new Zotero.Item("case");
// Copy the truthy volume/reporter/pages fields onto the new item
for (key in this.vv.volRepPag[i]) {
if (this.vv.volRepPag[i][key]) {
this.item[key] = this.vv.volRepPag[i][key];
// Only the last entry is given a placeholder item ID
if (i === (this.vv.volRepPag.length - 1)) {
this.item.itemID = "" + bogusItemID;
bogusItemID += 1;
if (completed_items.length === 0) {
throw new Error("Failed to parse \"" + this.citelet + "\"");
for (i = 0, ilen = completed_items.length; i < ilen; i += 1) {
for (j = 0, jlen = completed_items.length; j < jlen; j += 1) {
if (i === j) {
} else {
this.item = new Zotero.Item("case");
} else {
throw new Error("Failed to find title in \"" + this.citelet + "\"");
/**
 * Copy every truthy field of this.v (title, court, extra, ...) onto the
 * item currently being built in this.item. Falsy placeholders such as
 * `false` or `undefined` are skipped.
 */
ItemFactory.prototype.saveItemCommonVars = function () {
	// Declared locally: an undeclared loop variable would leak a global and
	// throws a ReferenceError in strict-mode code.
	for (var key in this.v) {
		if (this.v[key]) {
			this.item[key] = this.v[key];
		}
	}
};
Test Case Descriptions: (these have not been included in the test case JSON below as per
aurimasv's comment on
"description": "Legacy test case",
"url": "",
"description": "Legacy test case",
"url": "",
"description": "Legacy test case",
"url": "",
"description": "Legacy test case",
"url": "",
"description": "Legacy test case",
"url": "",
"description": "Legacy test case",
"url": ",5",
"description": "Decided date not preceded by any word or any other date line",
"url": "",
"description": "Decided date preceded by 'Dated'",
"url": "",
"description": "Decided date preceded by 'Released'",
"url": "",
"description": "Decided date preceded by 'Decided' and also by a 'Submitted' date line",
"url": "",
"description": "Decided date preceded by 'Decided' and also by an 'Argued' date line",
"url": "",
"description": "Decided date preceded by 'Decided' and also by an 'Argued' date line and followed by an 'As Modified' line; most citers of this case appear to use the Decided date, not the As Modified date",
"url": "",
var testCases = [
"type": "web",
"url": "",
"items": "multiple"
"type": "web",
"url": "",
"items": "multiple"
"type": "web",
"url": "",
"items": "multiple"
"type": "web",
"url": "",
"items": "multiple"
"type": "web",
"url": "",
"items": "multiple"
"type": "web",
"url": ",5",
"items": [
"itemType": "case",
"caseName": "Marbury v. Madison",
"creators": [],
"dateDecided": "1803",
"court": "Supreme Court",
"firstPage": "137",
"itemID": "1",
"reporter": "US",
"reporterVolume": "5",
"attachments": [
"title": "Google Scholar Judgement",
"type": "text/html"
"tags": [],
"notes": [],
"seeAlso": []
"type": "web",
"url": "",
"items": [
"itemType": "case",
"caseName": "Meier ex rel. Meier v. Sun Intern. Hotels, Ltd.",
"creators": [],
"dateDecided": "April 19, 2002",
"court": "Court of Appeals, 11th Circuit",
"firstPage": "1264",
"itemID": "1",
"reporter": "F. 3d",
"reporterVolume": "288",
"attachments": [
"title": "Google Scholar Judgement",
"type": "text/html"
"tags": [],
"notes": [],
"seeAlso": []
"type": "web",
"url": "",
"items": [
"itemType": "case",
"caseName": "Patio Enclosures, Inc. v. Four Seasons Marketing Corp.",
"creators": [],
"dateDecided": "September 21, 2005",
"court": "Court of Appeals, 9th Appellate Dist.",
"extra": "{:jurisdiction: Ohio}",
"firstPage": "4933",
"itemID": "1",
"reporter": "Ohio",
"reporterVolume": "2005",
"attachments": [
"title": "Google Scholar Judgement",
"type": "text/html"
"tags": [],
"notes": [],
"seeAlso": []
"type": "web",
"url": "",
"items": [
"itemType": "case",
"caseName": "Click v. Estate of Click",
"creators": [],
"dateDecided": "June 13, 2007",
"court": "Court of Appeals, 4th Appellate Dist.",
"extra": "{:jurisdiction: Ohio}",
"firstPage": "3029",
"itemID": "1",
"reporter": "Ohio",
"reporterVolume": "2007",
"attachments": [
"title": "Google Scholar Judgement",
"type": "text/html"
"tags": [],
"notes": [],
"seeAlso": []
"type": "web",
"url": "",
"items": [
"itemType": "case",
"caseName": "Kenty v. Transamerica Premium Ins. Co.",
"creators": [],
"dateDecided": "July 5, 1995",
"court": "Supreme Court",
"extra": "{:jurisdiction: Ohio}",
"firstPage": "415",
"itemID": "1",
"reporter": "Ohio St. 3d",
"reporterVolume": "72",
"attachments": [
"title": "Google Scholar Judgement",
"type": "text/html"
"tags": [],
"notes": [],
"seeAlso": []
"type": "web",
"url": "",
"items": [
"itemType": "case",
"caseName": "Tinker v. Des Moines Independent Community School Dist.",
"creators": [],
"dateDecided": "February 24, 1969",
"court": "Supreme Court",
"firstPage": "503",
"itemID": "1",
"reporter": "US",
"reporterVolume": "393",
"attachments": [
"title": "Google Scholar Judgement",
"type": "text/html"
"tags": [],
"notes": [],
"seeAlso": []
"type": "web",
"url": "",
"items": [
"itemType": "case",
"caseName": "Kaimowitz v. Board of Trustees of U. of Illinois",
"creators": [],
"dateDecided": "December 23, 1991",
"court": "Court of Appeals, 7th Circuit",
"firstPage": "765",
"itemID": "1",
"reporter": "F. 2d",
"reporterVolume": "951",
"attachments": [
"title": "Google Scholar Judgement",
"type": "text/html"
"tags": [],
"notes": [],
"seeAlso": []
"type": "web",
"url": "",
"items": [
"itemType": "case",
"caseName": "Kline v. Mortgage Electronic Security Systems",
"creators": [],
"dateDecided": "February 27, 2013",
"court": "Dist. Court",
"docketNumber": "Case No. 3:08cv408",
"extra": "{:jurisdiction: SD Ohio}",
"attachments": [
"title": "Google Scholar Judgement",
"type": "text/html"
"tags": [],
"notes": [],
"seeAlso": []
"type": "web",
"url": "",
"items": [
"itemType": "journalArticle",
"title": "Linked data-the story so far",
"creators": [
"firstName": "Christian",
"lastName": "Bizer",
"creatorType": "author"
"firstName": "Tom",
"lastName": "Heath",
"creatorType": "author"
"firstName": "Tim",
"lastName": "Berners-Lee",
"creatorType": "author"
"date": "2009",
"itemID": "bizer2009linked",
"libraryCatalog": "Google Scholar",
"pages": "205227",
"publicationTitle": "Semantic services, interoperability and web applications: emerging concepts",
"attachments": [
"title": "Snapshot"
"title": "Fulltext",
"mimeType": "application/pdf"
"tags": [],
"notes": [],
"seeAlso": []
"type": "web",
"url": "",
"items": "multiple"
"type": "web",
"url": ",5&as_vis=1&q=%22transformative+works+and+cultures%22&scisbd=1",
"items": "multiple"