Difference between revisions of "Module:Webarchive"
m (1 revision: Prunus laurocerasus) |
m (1 revision: Equisetum) |
||
(One intermediate revision by the same user not shown) | |||
Line 1: | Line 1: | ||
--[[ ---------------------------------- | --[[ ---------------------------------- | ||
− | + | Lua module implementing the {{webarchive}} template. | |
− | + | A merger of the functionality of three templates: {{wayback}}, {{webcite}} and {{cite archives}} | |
− | + | ||
− | + | ]] | |
− | local | + | require('Module:No globals'); |
+ | local getArgs = require ('Module:Arguments').getArgs; | ||
+ | local this_page = mw.title.getCurrentTitle(); | ||
− | |||
− | + | --[[--------------------------< F O R W A R D D E C L A R A T I O N S >-------------------------------------- | |
+ | ]] | ||
− | + | local categories = {}; -- category names from ./data | |
+ | local err_warn_msgs = {}; -- error and warning messages from ./data | ||
+ | local excepted_pages = {}; | ||
+ | local prefixes = {}; -- service provider tail string prefixes from ./data | ||
+ | local services = {}; -- archive service provider data from ./data | ||
+ | local uncategorized_namespaces = {}; -- list of namespaces that we should not categorize | ||
+ | local uncategorized_subpages = {}; -- list of subpages that should not be categorized | ||
− | local | + | local ulx = {}; -- Associative array to hold template data |
+ | local track = {}; -- Associative array to hold tracking categories | ||
− | |||
− | |||
− | + | --[[--------------------------< G L O B A L C O N F I G U R A T I O N S E T T I N G S >-------------------- | |
− | + | ]] | |
− | + | local maxurls = 10; -- Max number of URLs allowed. | |
− | + | local tname = 'Webarchive' -- name of calling template. Change if template rename. | |
+ | local verifydates = 'yes' -- See documentation. Set "no" to disable. | ||
− | |||
--[[--------------------------< inlineError >-----------------------
Render a critical error entirely in red and add this page to the error
tracking category.  The output names the offending template parameter.
]]
local function inlineError(arg, msg)
	track[categories.error] = 1;
	return table.concat ({
		'<span style="font-size:100%" class="error citation-comment">Error in webarchive template: Check <code style="color:inherit; border:inherit; padding:inherit;">|',
		arg,
		'=</code> value. ',
		msg,
		'</span>'
		});
end
− | |||
− | + | --[[--------------------------< inlineRed >----------------------- | |
− | + | ||
+ | Render a text fragment in red, such as a warning as part of the final output. | ||
+ | Add tracking category. | ||
]] | ]] | ||
− | local function | + | local function inlineRed(msg, trackmsg) |
− | + | if trackmsg == "warning" then | |
− | + | track[categories.warning] = 1; | |
− | + | elseif trackmsg == "error" then | |
− | + | track[categories.error] = 1; | |
− | + | end | |
− | + | ||
− | + | return '<span style="font-size:100%" class="error citation-comment">' .. msg .. '</span>' | |
− | |||
− | |||
− | |||
− | |||
− | |||
end | end | ||
+ | |||
--[[--------------------------< base62 >-----------------------
Convert a base-62 string to a base-10 number.
Digit values: '0'-'9' -> 0-9, 'A'-'Z' -> 10-35, 'a'-'z' -> 36-61.
Returns 1 when the input contains characters outside [0-9A-Za-z].
Credit: https://de.wikipedia.org/wiki/Modul:Expr
]]
local function base62( value )
	local r = 1 -- default return value when input value is malformed
	if value:match( "^%w+$" ) then -- value must only be in the set [0-9a-zA-Z]
		local n = #value -- number of characters in value
		local k = 1 -- place value of the current digit: 62^0, 62^1, ...
		local c
		r = 0
		for i = n, 1, -1 do -- loop through all characters in value from ls digit to ms digit
			c = value:byte( i, i )
			if c >= 48 and c <= 57 then -- character is digit 0-9
				c = c - 48
			elseif c >= 65 and c <= 90 then -- character is ascii A-Z (value 10-35); original comment wrongly said a-z
				c = c - 55 -- 'A' (65) -> 10 .. 'Z' (90) -> 35
			else -- must be ascii a-z (value 36-61); original comment wrongly said A-Z
				c = c - 61 -- 'a' (97) -> 36 .. 'z' (122) -> 61
			end
			r = r + c * k -- accumulate this base62 character's value
			k = k * 62 -- bump place value for next digit
		end -- for i
	end
	return r
end
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
--[[--------------------------< tableLength >-----------------------
Count the key/value pairs in a table (works for array-style and
associative tables alike).
]]
local function tableLength(T)
	local n = 0
	for _ in pairs(T) do
		n = n + 1
	end
	return n
end
Line 113: | Line 113: | ||
--[[--------------------------< dateFormat >-----------------------
Determine the format of a date string: 'dmy', 'mdy', 'iso', or 'ymd'.
Returns nil when no format pattern matches; returns false when a pattern
matches but the captured year is out of bounds.
]]
local function dateFormat(date)
	local patterns = {
		['iso'] = '(%d%d%d%d)%-%d%d%-%d%d',
		['dmy'] = '%d%d? +%a+ +(%d%d%d%d)',
		['mdy'] = '%a+ %d%d?, +(%d%d%d%d)',
		['ymd'] = '(%d%d%d%d) +%a+ %d%d?', -- TODO: not mos compliant; delete?
	};

	for form, pattern in pairs (patterns) do -- try each format pattern in turn
		local year = mw.ustring.match (date, pattern);
		if year then -- pattern matched; year holds the capture
			if 1900 < tonumber (year) and 2200 > tonumber (year) then -- TODO (original): why 1900? shouldn't that be birth-of-internet year? why 2200? shouldn't that be current year?
				return form;
			end
			return false; -- matched, but year out of bounds
		end
	end
	return nil; -- no pattern matched
end
+ | |||
--[[--------------------------< makeDate >-----------------------
Given a zero-padded 4-digit year, 2-digit month and 2-digit day, return a
full date in df format, where df is one of: mdy, dmy, iso, ymd.

On entry year, month, day are presumed correct for the date that they
represent; all arguments are required.  Returns nil when any date part is
missing or empty, or when df is not a recognized format.

Bug fixed: the original guard read
	not year or '' == year or ... or '' == day and format[df]
where `and` binds tighter than `or`, so an unrecognized df was never
rejected and formatDate() was called with a nil format string.
]]
local function makeDate(year, month, day, df)
	local format = {
		['dmy'] = 'j F Y',
		['mdy'] = 'F j, Y',
		['ymd'] = 'Y F j',
		['iso'] = 'Y-m-d',
	};

	if not (year and '' ~= year and month and '' ~= month and day and '' ~= day and format[df]) then
		return nil; -- a required part is missing/empty, or df is not a known format
	end

	local date = table.concat ({year, month, day}, '-'); -- assemble iso format date
	return mw.getContentLanguage():formatDate (format[df], date);
end
− | |||
− | |||
− | |||
− | + | --[[--------------------------< I S _ V A L I D _ D A T E >---------------------------------------------------- | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | Returns true if date is after 31 December 1899 (why is 1900 the min year? shouldn't the internet's date-of-birth | |
− | + | be min year?), not after today's date, and represents a valid date (29 February 2017 is not a valid date). Applies | |
− | + | Gregorian leapyear rules. | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | all arguments are required | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
+ | ]] | ||
+ | |||
+ | local function is_valid_date (year, month, day) | ||
+ | local days_in_month = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; | ||
+ | local month_length; | ||
+ | local y, m, d; | ||
+ | local today = os.date ('*t'); -- fetch a table of current date parts | ||
+ | |||
+ | if not year or '' == year or not month or '' == month or not day or '' == day then | ||
+ | return false; -- something missing | ||
+ | end | ||
+ | |||
+ | y = tonumber (year); | ||
+ | m = tonumber (month); | ||
+ | d = tonumber (day); | ||
+ | |||
+ | if 1900 > y or today.year < y or 1 > m or 12 < m then -- year and month are within bounds TODO: 1900? | ||
+ | return false; | ||
+ | end | ||
+ | |||
+ | if (2==m) then -- if February | ||
+ | month_length = 28; -- then 28 days unless | ||
+ | if (0==(y%4) and (0~=(y%100) or 0==(y%400))) then -- is a leap year? | ||
+ | month_length = 29; -- if leap year then 29 days in February | ||
+ | end | ||
+ | else | ||
+ | month_length=days_in_month[m]; | ||
+ | end | ||
+ | |||
+ | if 1 > d or month_length < d then -- day is within bounds | ||
+ | return false; | ||
+ | end | ||
+ | -- here when date parts represent a valid date | ||
+ | return os.time({['year']=y, ['month']=m, ['day']=d, ['hour']=0}) <= os.time(); -- date at midnight must be less than or equal to current date/time | ||
end | end | ||
Line 208: | Line 214: | ||
--[[--------------------------< decodeWebciteDate >-----------------------
Given a URI-path to WebCite (eg. /67xHmVFWP), return the base-62 encoded
snapshot date rendered in df format.  Returns the string 'query' for
WebCite URL forms that do not carry a base-62 timestamp.
]]
local function decodeWebciteDate(path, df)
	local parts = mw.text.split (path, '/');
	local id = parts[2]; -- first path segment after the leading '/'

	-- valid URL forms that are not base62:
	--   http://www.webcitation.org/query?id=1138911916587475
	--   http://www.webcitation.org/query?url=http..&date=2012-06-01+21:40:03
	--   http://www.webcitation.org/1138911916587475
	--   http://www.webcitation.org/cache/73e53dd1f16cf8c5da298418d2a6e452870cf50e
	--   http://www.webcitation.org/getfile.php?fileid=1c46e791d68e89e12d0c2532cc3cf629b8bc8c8e
	if mw.ustring.find (id, 'query', 1, true)
		or mw.ustring.find (id, 'cache', 1, true)
		or mw.ustring.find (id, 'getfile', 1, true)
		or tonumber (id) then
			return 'query';
	end

	-- base62 string -> number -> text -> first 10 characters (a unix timestamp) -> table of date parts
	local dt = os.date ('*t', string.format ('%d', base62 (id)):sub (1, 10));

	if not is_valid_date (dt.year, dt.month, dt.day) then
		return inlineRed (err_warn_msgs.date_err, 'error');
	end

	return makeDate (dt.year, dt.month, dt.day, df) or inlineRed (err_warn_msgs.date4, 'error');
end
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
--[[--------------------------< decodeWaybackDate >-----------------------
Given a URI-path to Wayback (eg. /web/20160901010101/http://example.com)
or Library of Congress Web Archives (/all/20160901010101/http://example.com),
return the formatted date (eg. "September 1, 2016") in df format.
Returns 'index' when the timestamp is '*' (calendar view at archive.org).
Tolerates non-digit noise in the snapshot ID such as "re_", "-" and "*".
]]
local function decodeWaybackDate(path, df)
	local snapdate = path:gsub ('^/all/', ''):gsub ('^/web/', ''):gsub ('^/', ''); -- strip leading '/all/', '/web/', or bare '/'
	snapdate = snapdate:match ('^[^/]+'); -- keep the timestamp: everything before the next '/'

	if '*' == snapdate then -- eg. /web/*/http.. or /all/*/http..
		return 'index';
	end

	snapdate = snapdate:gsub ('%a%a_%d?$', ''):gsub ('%-', ''); -- strip trailing "re_"-style suffix and any dashes

	local msg = '';
	if snapdate:match ('%*$') then -- a trailing '*' causes calendar display at archive.org
		snapdate = snapdate:gsub ('%*$', ''); -- remove so it is not part of the length calculation below
		-- msg = inlineRed(err_warn_msgs.ts_cal, 'warning'); -- TODO: enable this -- make a message
	end

	if not tonumber (snapdate) then
		return inlineRed (err_warn_msgs.date2, 'error'); -- timestamp is not wholly numeric
	end

	local dlen = #snapdate;
	if dlen < 8 then -- need at least yyyymmdd; TODO (original): shouldn't this be testing for 14 digits?
		return inlineRed (err_warn_msgs.date3, 'error');
	end

	local year, month, day = snapdate:match ('(%d%d%d%d)(%d%d)(%d%d)');

	if not is_valid_date (year, month, day) then
		return inlineRed (err_warn_msgs.date_err, 'error');
	end

	return makeDate (year, month, day, df) or inlineRed (err_warn_msgs.date7, 'error');

--[[ TODO: enable this (in place of the single return above) so short
timestamps carry a warning:
	snapdate = makeDate(year, month, day, df);
	if snapdate then
		if 14 == dlen then
			return snapdate, msg; -- return date with message if any
		else
			return snapdate, msg .. inlineRed(err_warn_msgs.ts_len, 'warning'); -- return date with warning message(s)
		end
	else
		return inlineRed (err_warn_msgs.date7, 'error'); -- return error message
	end
]]
end
+ | |||
--[[--------------------------< decodeArchiveisDate >-----------------------
Given an Archive.is "long link" URI-path (eg. /2016.08.28-144552/http://example.com)
return the date in df format (eg. 28 August 2016 when df is dmy).
Periods and hyphens in the snapshot date are ignored, so 2016.08.28-144552
is treated the same as 20160828144552.  A short-form path (base-62 digits
only, eg. /hD1qz) returns the string 'short link'.
]]
local function decodeArchiveisDate(path, df)
	if path:match ('^/%w+$') then -- short form url path: '/' followed only by base62 digits
		return 'short link'; -- eg. http://archive.is/hD1qz
	end

	local snapdate = mw.text.split (path, '/')[2]:gsub ('[%.%-]', ''); -- snapshot timestamp with periods and hyphens removed

	local dlen = #snapdate;
	if dlen < 8 then -- need at least yyyymmdd; TODO (original): shouldn't this be testing for 14 digits?
		return inlineRed (err_warn_msgs.date3, 'error');
	end

	local year, month, day = snapdate:match ('(%d%d%d%d)(%d%d)(%d%d)');

	if not is_valid_date (year, month, day) then
		return inlineRed (err_warn_msgs.date_err, 'error');
	end

	local formatted = makeDate (year, month, day, df);
	if not formatted then
		return inlineRed (err_warn_msgs.date7, 'error'); -- date could not be formatted
	end

	if 14 == dlen then
		return formatted; -- full-length timestamp: return the date alone
	end
	return formatted, inlineRed (err_warn_msgs.ts_len, 'warning'); -- short timestamp: date plus warning
end
− | --[[ | + | --[=[-------------------------< M A K E _ W I K I L I N K >---------------------------------------------------- |
+ | |||
+ | Makes a wikilink; when both link and display text is provided, returns a wikilink in the form [[L|D]]; if only | ||
+ | link is provided, returns a wikilink in the form [[L]]; if neither are provided or link is omitted, returns an | ||
+ | empty string. | ||
− | + | ]=] | |
− | + | local function make_wikilink (link, display, no_link) | |
+ | if nil == no_link then | ||
+ | if link and ('' ~= link) then | ||
+ | if display and ('' ~= display) then | ||
+ | return table.concat ({'[[', link, '|', display, ']]'}); | ||
+ | else | ||
+ | return table.concat ({'[[', link, ']]'}); | ||
+ | end | ||
+ | end | ||
+ | return display or ''; -- link not set so return the display text | ||
− | + | else -- no_link | |
+ | if display and ('' ~= display) then -- if there is display text | ||
+ | return display; -- return that | ||
+ | else | ||
+ | return link or ''; -- return the target article name or empty string | ||
+ | end | ||
+ | end | ||
+ | end | ||
− | |||
--[[--------------------------< serviceName >-----------------------

Given a url host name extracted by mw.uri.new() (eg. web.archive.org),
set ulx.url1.service and ulx.url1.tail (the " at [[Service]]" tail text),
and record a tracking category.  Unrecognized hosts get service 'other',
the unknown-service category, and a red note in the tail.

]]
local function serviceName(host, no_link)
	local tracking; -- tracking category chosen for this service
	local key; -- index into services[]

	host = host:lower():gsub ('^web%.(.+)', '%1'):gsub ('^www%.(.+)', '%1'); -- lowercase; strip web. and www. subdomains

	if services[host] then -- exact host match
		key = host;
	else -- else look for a known service domain at a word boundary within host
		for domain in pairs (services) do
			if host:find ('%f[%a]' .. domain:gsub ('([%.%-])', '%%%1')) then -- escape pattern-magic '.' and '-' in the domain
				key = domain;
				break;
			end
		end
	end

	if key then
		local svc = services[key];
		local tail_parts = {''}; -- empty string in [1] so that the concatenated result has a leading single space
		ulx.url1.service = svc[4] or 'other';
		tracking = svc[5] or categories.other;

		-- build tail string; select the prefix: false -> 'at', true -> 'at the', else a custom string
		if false == svc[1] then
			table.insert (tail_parts, prefixes.at);
		elseif true == svc[1] then
			table.insert (tail_parts, prefixes.atthe);
		else
			table.insert (tail_parts, svc[1]);
		end

		table.insert (tail_parts, make_wikilink (svc[2], svc[3], no_link)); -- the service's article wikilink
		if svc[6] then -- optional tail postfix
			table.insert (tail_parts, svc[6]);
		end

		ulx.url1.tail = table.concat (tail_parts, ' '); -- assembled tail; has a leading space character
	else -- unknown archive service
		ulx.url1.service = 'other';
		tracking = categories.unknown;
		ulx.url1.tail = table.concat ({'', prefixes.at, host, inlineRed (err_warn_msgs.unknown_url)}, ' '); -- TODO (original): call to inlineRed() does not specify 'error' or 'warning'; should it?
	end

	track[tracking] = 1;
end
+ | |||
--[[--------------------------< parseExtraArgs >-----------------------
Parse numbered arguments starting at 2 (url2..url10, date2..date10,
title2..title10) and pack them into ulx as a gap-free numeric sequence.

For example {{webarchive |url=.. |url4=.. |url7=..}} has three url
arguments not in numeric sequence (1..4..7); the extras become ulx.url2
and ulx.url3.  NOTE(review): the matching date/title arguments are looked
up by the *renumbered* destination index (date2 for url4 here), exactly
as the original code did — confirm this is the intended template behavior.

Returns the number of extra URL arguments found (2 in the example).
]]
local function parseExtraArgs(args)
	local slot = 2; -- next gap-free destination index in ulx
	for i = 2, maxurls do
		local url_val = args['url' .. i];
		if url_val then
			local key = 'url' .. slot;
			ulx[key] = {};
			ulx[key]['url'] = url_val;
			ulx[key]['date'] = args['date' .. slot] or inlineRed (err_warn_msgs.date_miss, 'warning'); -- date keyed by destination index, per original
			ulx[key]['title'] = args['title' .. slot]; -- nil when absent
			slot = slot + 1;
		end
	end
	return slot - 2; -- count of extra urls captured (0 when none)
end
− | |||
--[[--------------------------< comma >-----------------------
Return ',' when the given date string is in MDY format ("Month D, YYYY");
otherwise return an empty string.  Accepts nil.
]]
local function comma(date)
	if not date then
		return '';
	end
	return date:match ('%a+ +%d%d?(,) +%d%d%d%d') or '';
end
+ | |||
--[[--------------------------< createTracking >-----------------------
Render the accumulated tracking categories in track[] as a string of
category wikilinks.  Returns an empty string when this page must not be
categorized (excluded namespace or subpage pattern) — unless the page is
explicitly excepted (typically this module's / template's testcases pages).
]]
local function createTracking()
	if not excepted_pages[this_page.fullText] then -- namespace:title/fragment may be exempt from the no-categorization rules
		if uncategorized_namespaces[this_page.nsText] then -- TODO (original): enable this chunk
			return ''; -- this namespace not to be categorized
		end
		for _, pattern in ipairs (uncategorized_subpages) do -- cycle through excluded-subpage name patterns
			if this_page.text:match (pattern) then
				return ''; -- this subpage type not to be categorized
			end
		end
	end

	local links = {};
	if 0 < tableLength (track) then
		for cat_name in pairs (track) do -- convert each accumulated category name to a wikilink
			table.insert (links, make_wikilink (cat_name));
		end
	end
	return table.concat (links); -- one big string; empty when no categories accumulated
end
+ | |||
--[[--------------------------< createRendering >-----------------------
Render the template output from the data collected in ulx[][].

TODO (from original): when the archive date is '*' ('index') the leading
archive extlink should read [<url> Archive index] rather than
[<url> Archived] index; supporting code exists but is disabled upstream.
NOTE(review): the original had a TODO "why the html entity?" at the
opening parenthesis of the date; this reconstruction uses the literal
' (' visible in the source — confirm against the live template.
]]
local function createRendering()
	local label
	local parts = {};

	local sep1 = ''; -- terminators; for backwards compat with {{wayback}}
	local sep2 = '.';

	if 'none' == ulx.url1.format then -- for {{wayback}} and {{webcite}}
		table.insert (parts, '['); -- open extlink markup
		table.insert (parts, ulx.url1.url); -- the archive url

		if ulx.url1.title then
			table.insert (parts, ' ') -- the required space
			table.insert (parts, ulx.url1.title) -- the title as link label
			table.insert (parts, ']'); -- close extlink markup
			table.insert (parts, ulx.url1.tail); -- tail text
			if ulx.url1.date then
				table.insert (parts, ' ('); -- open the parenthesized date
				table.insert (parts, 'index' == ulx.url1.date and 'archive' or 'archived');
				table.insert (parts, ' ');
				table.insert (parts, ulx.url1.date);
				table.insert (parts, ')'); -- close the parenthesized date
			end
		else -- no title
			table.insert (parts, ' Archived]') -- close extlink markup
			if ulx.url1.date then
				if 'wayback' == ulx.url1.service then
					sep1 = '.';
					sep2 = '';
				end
				table.insert (parts, ' ' .. ulx.url1.date); -- the date
				table.insert (parts, comma (ulx.url1.date)); -- ',' when date format is mdy
				table.insert (parts, ulx.url1.tail); -- tail text
				table.insert (parts, sep1); -- terminate
			else -- no date
				table.insert (parts, ulx.url1.tail); -- tail text only
			end
		end

		if 0 < ulx.url1.extraurls then -- multiple archive URLs
			local total = ulx.url1.extraurls + 1
			table.insert (parts, sep2); -- terminate the first url
			table.insert (parts, ' Additional archives: '); -- header text

			for i = 2, total do -- loop through the additionals
				local key = 'url' .. i;
				label = ulx[key]['title'] and 'title' or 'date'; -- prefer title as link label
				table.insert (parts, '['); -- open extlink markup
				table.insert (parts, ulx[key]['url']);
				table.insert (parts, ' '); -- the required space
				table.insert (parts, ulx[key][label]); -- the link label
				table.insert (parts, ']'); -- close extlink markup
				table.insert (parts, i == total and '.' or ', '); -- terminator
			end
		end
		return table.concat (parts); -- one big string; done

	else -- for {{cite archives}}
		if 'addlarchives' == ulx.url1.format then -- multiple archive services
			table.insert (parts, 'Additional archives: '); -- header text
		else -- multiple pages from the same archive
			table.insert (parts, 'Additional pages archived on ');
			table.insert (parts, ulx.url1.date); -- date in the header text
			table.insert (parts, ': ');
		end

		local total = ulx.url1.extraurls + 1;
		for i = 1, total do -- loop through all urls
			local key = 'url' .. i;
			table.insert (parts, '['); -- open extlink markup
			table.insert (parts, ulx[key]['url']);
			table.insert (parts, ' '); -- the required space

			label = ulx[key]['title'];
			if not label then
				if 'addlarchives' == ulx.url1.format then
					label = ulx[key]['date']; -- fall back to date
				else -- must be addlpages
					label = 'Page ' .. i; -- fall back to page number
				end
			end
			table.insert (parts, label); -- title, date, or page label
			table.insert (parts, ']'); -- close extlink markup
			table.insert (parts, (i == total and '.' or ', ')); -- terminator
		end
		return table.concat (parts); -- one big string; done
	end
end
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | --[[--------------------------< W E B A R C H I V E >---------------------------------------------------------- | |
+ | |||
+ | template entry point | ||
− | + | TODO: deprecate empty |nolink= as a 'positive' assertion that archive service is not to be linked | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | ]] | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | local function webarchive(frame) | |
+ | local args = getArgs (frame, { -- TODO: delete this assignment | ||
+ | valueFunc = function (key, value) -- this code so that we can detect and handle the oddity that is |nolink= | ||
+ | if 'nolink' == key then -- |nolink= is 'set' when present with or without assigned value; TODO: deprecate this peculiar use | ||
+ | return value; -- don't trim; we don't care (right now) what the value is except when nil and we can't trim nil | ||
+ | elseif value then -- all other values: if the value is not nil | ||
+ | value = mw.text.trim (value); -- trim whitespace | ||
+ | if '' ~= value then -- empty string when value was only whitespace or was empty | ||
+ | return value; -- return non-nil, non-empty values | ||
+ | end | ||
+ | end | ||
+ | return nil; -- value was nil, empty, or contained only whitespace | ||
+ | end -- end of valueFunc | ||
+ | }); | ||
− | + | -- local args = getArgs (frame); -- TODO: replace the above with this | |
+ | |||
+ | local data = mw.loadData (table.concat ({ -- make a data module name; sandbox or live | ||
+ | 'Module:Webarchive/data', | ||
+ | frame:getTitle():find('sandbox', 1, true) and '/sandbox' or '' -- this instance is ./sandbox then append /sandbox | ||
+ | })); | ||
+ | categories = data.categories; -- fill in the forward declarations | ||
+ | err_warn_msgs = data.err_warn_msgs; | ||
+ | excepted_pages = data.excepted_pages; | ||
+ | prefixes = data.prefixes; | ||
+ | services = data.services; | ||
+ | uncategorized_namespaces = data.uncategorized_namespaces; | ||
+ | uncategorized_subpages = data.uncategorized_subpages; | ||
+ | |||
− | + | local date, format, msg, uri, url; | |
+ | |||
+ | verifydates = 'yes' == verifydates; -- convert to boolean | ||
− | + | if args.url and args.url1 then -- URL argument (first) | |
+ | return inlineError("url", "Conflicting |url= and |url1=.") .. createTracking(); | ||
+ | end | ||
+ | |||
+ | url = args.url or args.url1; | ||
+ | |||
+ | if not url then | ||
+ | return inlineError("url", "Empty.") .. createTracking() | ||
+ | end | ||
+ | if mw.ustring.find( url, "https://web.http", 1, true ) then -- track bug - TODO: IAbot bug; not known if the bug has been fixed; deferred | ||
+ | track[categories.error] = 1; | ||
+ | return inlineError("url", "https://web.http") .. createTracking() | ||
+ | end | ||
+ | if url == "https://web.archive.org/http:/" then -- track bug - TODO: IAbot bug; not known if the bug has been fixed; deferred | ||
+ | track[categories.error] = 1; | ||
+ | return inlineError("url", "Invalid URL") .. createTracking() | ||
+ | end | ||
− | + | ulx.url1 = {} | |
− | + | ulx.url1.url = url | |
− | + | if not (url:lower():find ('^http') or url:find ('^//')) then -- TODO: is this a good idea? isn't it better to simply throw an error when url is malformed ... | |
− | + | ulx.url1.url = 'http://' .. url -- ... rather than apply this 'fix' that might not fix anything? | |
− | + | end | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | ulx.url1.extraurls = parseExtraArgs(args) | |
− | + | uri = mw.uri.new (ulx.url1.url); -- get a table of uri parts from this url | |
− | + | serviceName(uri.host, args.nolink) | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | if args.date and args.date1 then -- Date argument | |
+ | return inlineError("date", "Conflicting |date= and |date1=.") .. createTracking(); | ||
+ | end | ||
+ | |||
+ | date = args.date or args.date1 | ||
− | + | if 'wayback' == ulx.url1.service or 'locwebarchives' == ulx.url1.service then | |
− | + | if '*' == date then -- TODO: why is this not compared to url date? | |
− | + | date = 'index'; | |
+ | end | ||
+ | if date then | ||
+ | if verifydates then | ||
+ | local ldf = dateFormat(date) | ||
+ | if ldf then | ||
+ | local udate, msg = decodeWaybackDate( uri.path, ldf ) -- get the url date in the same format as date in |date=; 'index' when wayback date is * | ||
+ | if udate ~= date then | ||
+ | date = udate .. inlineRed (err_warn_msgs.mismatch, 'warning') .. (msg or ''); -- mismatch us url date; add message if there is one | ||
+ | else | ||
+ | date = date .. (msg or ''); -- add message if there is one | ||
+ | end | ||
+ | end | ||
+ | end | ||
+ | else -- no |date= | ||
+ | date, msg = decodeWaybackDate( uri.path, "iso" ) | ||
+ | if not date then | ||
+ | date = inlineRed (err_warn_msgs.date1, 'error'); -- TODO: change this type of message so that it identifies url as source of error? | ||
+ | else | ||
+ | date = date .. (msg or ''); -- add message if there is one | ||
+ | end | ||
+ | end | ||
− | + | elseif 'webcite' == ulx.url1.service then | |
− | + | if date then | |
− | + | if verifydates then | |
− | + | local ldf = dateFormat(date) | |
− | + | if ldf then | |
+ | local udate = decodeWebciteDate( uri.path, ldf ) -- get the url date in the same format as date in |date= | ||
+ | if 'query' ~= udate then -- skip if query | ||
+ | if udate ~= date then | ||
+ | date = udate .. inlineRed (err_warn_msgs.mismatch, 'warning'); | ||
+ | end | ||
+ | end | ||
+ | end | ||
+ | end | ||
+ | else | ||
+ | date = decodeWebciteDate( uri.path, "iso" ) | ||
+ | if date == "query" then | ||
+ | date = inlineRed (err_warn_msgs.date_miss, 'warning'); | ||
+ | elseif not date then | ||
+ | date = inlineRed (err_warn_msgs.date1, 'error'); | ||
+ | end | ||
+ | end | ||
− | + | elseif 'archiveis' == ulx.url1.service then | |
+ | if date then | ||
+ | if verifydates then | ||
+ | local ldf = dateFormat(date) | ||
+ | if ldf then | ||
+ | local udate, msg = decodeArchiveisDate( uri.path, ldf ) -- get the url date in the same format as date in |date= | ||
+ | if 'short link' ~= udate then -- skip if short link | ||
+ | if udate ~= date then | ||
+ | date = udate .. inlineRed (err_warn_msgs.mismatch, 'warning') .. (msg or ''); -- mismatch: use url date; add message if there is one | ||
+ | else | ||
+ | date = date .. (msg or ''); -- add message if there is one | ||
+ | end | ||
+ | end | ||
+ | end | ||
+ | end | ||
+ | else -- no |date= | ||
+ | date, msg = decodeArchiveisDate( uri.path, "iso" ) | ||
+ | if date == "short link" then | ||
+ | date = inlineRed (err_warn_msgs.date_miss, 'warning'); | ||
+ | elseif not date then | ||
+ | date = inlineRed (err_warn_msgs.date1, 'error'); | ||
+ | else | ||
+ | date = date .. (msg or ''); -- add message if there is one | ||
+ | end | ||
+ | end | ||
+ | |||
+ | else -- some other service | ||
+ | if not date then | ||
+ | date = inlineRed (err_warn_msgs.date_miss, 'warning'); | ||
+ | end | ||
+ | end | ||
+ | ulx.url1.date = date | ||
+ | |||
+ | format = args.format; -- Format argument | ||
+ | |||
+ | if not format then | ||
+ | format = "none" | ||
+ | else | ||
+ | if format == "addlpages" then | ||
+ | if not ulx.url1.date then | ||
+ | format = "none" | ||
+ | end | ||
+ | elseif format == "addlarchives" then | ||
+ | format = "addlarchives" | ||
+ | else | ||
+ | format = "none" | ||
+ | end | ||
+ | end | ||
+ | ulx.url1.format = format | ||
+ | |||
+ | if args.title and args.title1 then -- Title argument | ||
+ | return inlineError("title", "Conflicting |title= and |title1=.") .. createTracking(); | ||
+ | end | ||
+ | |||
+ | ulx.url1.title = args.title or args.title1; | ||
+ | |||
+ | local rend = createRendering() | ||
+ | if not rend then | ||
+ | rend = '<span style="font-size:100%" class="error citation-comment">Error in [[:Template:' .. tname .. ']]: Unknown problem. Please report on template talk page.</span>' | ||
+ | track[categories.error] = 1; | ||
+ | end | ||
+ | |||
+ | return rend .. createTracking() | ||
end | end | ||
− | return | + | |
+ | --[[--------------------------< E X P O R T E D F U N C T I O N S >------------------------------------------ | ||
+ | ]] | ||
+ | |||
+ | return {webarchive = webarchive}; |
Revision as of 15:04, 13 September 2018
--[[ ----------------------------------

Lua module implementing the {{webarchive}} template.

A merger of the functionality of three templates: {{wayback}}, {{webcite}} and {{cite archives}}

]]

require('Module:No globals');
local getArgs = require ('Module:Arguments').getArgs;
local this_page = mw.title.getCurrentTitle();


--[[--------------------------< F O R W A R D   D E C L A R A T I O N S >--------------------------------------
]]

local categories = {};								-- category names from ./data
local err_warn_msgs = {};							-- error and warning messages from ./data
local excepted_pages = {};							-- pages that are allowed to be categorized (e.g. testcases)
local prefixes = {};								-- service provider tail string prefixes from ./data
local services = {};								-- archive service provider data from ./data
local uncategorized_namespaces = {};				-- list of namespaces that we should not categorize
local uncategorized_subpages = {};					-- list of subpages that should not be categorized

local ulx = {};										-- associative array to hold template data
local track = {};									-- associative array to hold tracking categories


--[[--------------------------< G L O B A L   C O N F I G U R A T I O N   S E T T I N G S >--------------------
]]

local maxurls = 10;									-- max number of URLs allowed.
local tname = 'Webarchive'							-- name of calling template. Change if template rename.
local verifydates = 'yes'							-- see documentation. Set "no" to disable.
--[[--------------------------< inlineError >-----------------------

Critical error. Render output completely in red. Add to tracking category.

]]

local function inlineError(arg, msg)
	track[categories.error] = 1										-- flag the error tracking category
	-- NOTE(review): span/code markup was stripped by the scrape; reconstructed from the sibling
	-- error string visible in webarchive() -- confirm against the live module
	return '<span style="font-size:100%" class="error citation-comment">Error in webarchive template: Check <code style="color:inherit; border:inherit; padding:inherit;">&#124;' .. arg .. '=</code> value. ' .. msg .. '</span>'
end
--[[--------------------------< inlineRed >-----------------------

Render a text fragment in red, such as a warning as part of the final output.
Add tracking category according to trackmsg ('warning' or 'error').

]]

local function inlineRed(msg, trackmsg)
	if trackmsg == "warning" then									-- flag the appropriate tracking category
		track[categories.warning] = 1;
	elseif trackmsg == "error" then
		track[categories.error] = 1;
	end

	-- NOTE(review): the span markup was stripped by the scrape; reconstructed -- confirm the exact
	-- style/class against the live module
	return '<span style="font-size:100%" class="error citation-comment">' .. msg .. '</span>'
end
--[[--------------------------< base62 >-----------------------

Convert a base-62 string ([0-9a-zA-Z]) to a base-10 number.
Digit values: '0'-'9' -> 0-9, 'A'-'Z' -> 10-35, 'a'-'z' -> 36-61.
Returns 1 when the input is malformed (contains characters outside [0-9a-zA-Z]).
Credit: https://de.wikipedia.org/wiki/Modul:Expr

]]

local function base62( value )
	local r = 1														-- default return value when input value is malformed

	if value:match( "^%w+$" ) then									-- value must only be in the set [0-9a-zA-Z]
		local n = #value											-- number of characters in value
		local k = 1													-- place value of current digit
		local c
		r = 0
		for i = n, 1, -1 do											-- loop through all characters in value from ls digit to ms digit
			c = value:byte( i, i )
			if c >= 48 and c <= 57 then								-- character is digit 0-9
				c = c - 48
			elseif c >= 65 and c <= 90 then							-- character is ascii A-Z (uppercase) -> 10-35
				c = c - 55
			else													-- must be ascii a-z (lowercase) -> 36-61
				c = c - 61
			end
			r = r + c * k											-- accumulate this base62 character's value
			k = k * 62												-- bump for next place
		end
	end
	return r
end
--[[--------------------------< tableLength >-----------------------

Given a 1-D table, return number of elements

]]

local function tableLength(T)
	local n = 0;
	for _ in pairs (T) do											-- count every key, array part and hash part alike
		n = n + 1;
	end
	return n;
end
--[[--------------------------< dateFormat >-----------------------

Given a date string, return its format: dmy, mdy, iso, ymd
If unable to determine the format, return nil

]]

local function dateFormat(date)
	local patterns = {
		['iso'] = '(%d%d%d%d)%-%d%d%-%d%d',
		['dmy'] = '%d%d? +%a+ +(%d%d%d%d)',
		['mdy'] = '%a+ %d%d?, +(%d%d%d%d)',
		['ymd'] = '(%d%d%d%d) +%a+ %d%d?',							-- TODO: not mos compliant; delete?
		};

	local matched_format, year;

	for name, pattern in pairs (patterns) do						-- try each recognized format in turn
		year = mw.ustring.match (date, pattern);					-- capture the year when this pattern matches
		if year then
			matched_format = name;									-- remember which format matched
			break;													-- and done
		end
	end
																	-- sanity limits on the captured year; TODO: why 1900? why 2200?
	return (year and (1900 < tonumber(year) and 2200 > tonumber(year))) and matched_format;
end
--[[--------------------------< makeDate >-----------------------

Given a zero-padded 4-digit year, 2-digit month and 2-digit day, return a full date in df format
df = mdy, dmy, iso, ymd

on entry, year, month, day are presumed to be correct for the date that they represent; all are required

]]

local function makeDate(year, month, day, df)
	local format = {
		['dmy'] = 'j F Y',
		['mdy'] = 'F j, Y',
		['ymd'] = 'Y F j',
		['iso'] = 'Y-m-d',
		};

	-- '' literals restored (stripped by the scrape).
	-- NOTE(review): due to operator precedence the trailing 'and format[df]' binds only to the last
	-- comparison ('' == day) -- presumably a bug (df is otherwise unvalidated); confirm against the live module
	if not year or '' == year or not month or '' == month or not day or '' == day and format[df] then
		return nil;
	end

	local date = table.concat ({year, month, day}, '-');			-- assemble iso format date
	return mw.getContentLanguage():formatDate (format[df], date);	-- let MediaWiki render it in the requested format
end
--[[--------------------------< I S _ V A L I D _ D A T E >----------------------------------------------------

Returns true if date is after 31 December 1899 (why is 1900 the min year? shouldn't the internet's
date-of-birth be min year?), not after today's date, and represents a valid date (29 February 2017 is not a
valid date).  Applies Gregorian leapyear rules.

all arguments are required

]]

local function is_valid_date (year, month, day)
	local days_in_month = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
	local month_length;
	local y, m, d;
	local today = os.date ('*t');									-- fetch a table of current date parts

	-- '' literals restored (stripped by the scrape)
	if not year or '' == year or not month or '' == month or not day or '' == day then
		return false;												-- something missing
	end

	y = tonumber (year);
	m = tonumber (month);
	d = tonumber (day);

	if 1900 > y or today.year < y or 1 > m or 12 < m then			-- year and month are within bounds TODO: 1900?
		return false;
	end

	if (2==m) then													-- if February
		month_length = 28;											-- then 28 days unless
		if (0==(y%4) and (0~=(y%100) or 0==(y%400))) then			-- is a Gregorian leap year?
			month_length = 29;										-- if leap year then 29 days in February
		end
	else
		month_length=days_in_month[m];
	end

	if 1 > d or month_length < d then								-- day is within bounds
		return false;
	end
																	-- here when date parts represent a valid date
	return os.time({['year']=y, ['month']=m, ['day']=d, ['hour']=0}) <= os.time();	-- date at midnight must be less than or equal to current date/time
end
--[[--------------------------< decodeWebciteDate >-----------------------

Given a URI-path to Webcite (eg. /67xHmVFWP) return the encoded date in df format

]]

local function decodeWebciteDate(path, df)
	local parts = mw.text.split(path, "/");

	-- valid URL forms that do NOT carry a base62 timestamp:
	-- http://www.webcitation.org/query?id=1138911916587475
	-- http://www.webcitation.org/query?url=http..&date=2012-06-01+21:40:03
	-- http://www.webcitation.org/1138911916587475
	-- http://www.webcitation.org/cache/73e53dd1f16cf8c5da298418d2a6e452870cf50e
	-- http://www.webcitation.org/getfile.php?fileid=1c46e791d68e89e12d0c2532cc3cf629b8bc8c8e
	if mw.ustring.find( parts[2], "query", 1, true) or
		mw.ustring.find( parts[2], "cache", 1, true) or
		mw.ustring.find( parts[2], "getfile", 1, true) or
		tonumber(parts[2]) then
			return "query";
	end

	-- base62 snapshot id -> number -> formatted text -> first 10 characters -> a table of date parts
	local dt = os.date('*t', string.format("%d", base62(parts[2])):sub(1,10));

	if not is_valid_date (dt.year, dt.month, dt.day) then
		return inlineRed(err_warn_msgs.date_err, 'error');
	end

	return makeDate(dt.year, dt.month, dt.day, df) or inlineRed (err_warn_msgs.date4, 'error');
end
--[[--------------------------< decodeWaybackDate >-----------------------

Given a URI-path to Wayback (eg. /web/20160901010101/http://example.com ) or Library of Congress Web Archives
(/all/20160901010101/http://example.com) return the formatted date eg. "September 1, 2016" in df format.
Handle non-digits in snapshot ID such as "re_" and "-" and "*"

]]

local function decodeWaybackDate(path, df)
	local msg, snapdate;

	-- '' literals restored throughout (stripped by the scrape)
	snapdate = path:gsub ('^/all/', ''):gsub ('^/web/', ''):gsub ('^/', '');	-- remove leading '/all/', leading '/web/' or leading '/'
	snapdate = snapdate:match ('^[^/]+');							-- get timestamp
	if snapdate == "*" then											-- eg. /web/*/http.. or /all/*/http..
		return "index"
	end

	snapdate = snapdate:gsub ('%a%a_%d?$', ''):gsub ('%-', '');		-- from date, remove any trailing "re_", dashes

	msg = '';
	if snapdate:match ('%*$') then									-- a trailing '*' causes calendar display at archive.org
		snapdate = snapdate:gsub ('%*$', '');						-- remove so not part of length calc later
--		msg = inlineRed(err_warn_msgs.ts_cal, 'warning');			-- TODO: enable this -- make a message
	end

	if not tonumber(snapdate) then
		return inlineRed (err_warn_msgs.date2, 'error');			-- timestamp not all digits
	end
	local dlen = string.len(snapdate)
	if dlen < 8 then												-- we need 8 digits TODO: but shouldn't this be testing for 14 digits?
		return inlineRed (err_warn_msgs.date3, 'error');
	end

	local year, month, day = snapdate:match ('(%d%d%d%d)(%d%d)(%d%d)');	-- no need for snapdatelong here

	if not is_valid_date (year, month, day) then
		return inlineRed(err_warn_msgs.date_err, 'error');
	end

	return makeDate(year, month, day, df) or inlineRed (err_warn_msgs.date7, 'error');

--[[ TODO: enable this (replaces the return above) so that short-timestamp / calendar warnings are returned
	snapdate = makeDate(year, month, day, df);
	if snapdate then
		if 14 == dlen then
			return snapdate, msg;									-- return date with message if any
		else
			return snapdate, msg .. inlineRed(err_warn_msgs.ts_len, 'warning');	-- return date with warning message(s)
		end
	else
		return inlineRed (err_warn_msgs.date7, 'error');			-- return error message
	end
]]
end
--[[--------------------------< decodeArchiveisDate >-----------------------

Given an Archive.is "long link" URI-path (e.g. /2016.08.28-144552/http://example.com) return the date in df
format (e.g. if df = dmy, return 28 August 2016).
Handles "." and "-" in snapshot date, so 2016.08.28-144552 is same as 20160828144552

]]

local function decodeArchiveisDate(path, df)
	local snapdate

	if path:match ('^/%w+$') then									-- short form url path is '/' followed by some number of base 62 digits and nothing else
		return "short link"											-- e.g. http://archive.is/hD1qz
	end

	-- '' literal restored (stripped by the scrape)
	snapdate = mw.text.split (path, '/')[2]:gsub('[%.%-]', '');		-- get snapshot date, e.g. 2016.08.28-144552; remove periods and hyphens

	local dlen = string.len(snapdate)
	if dlen < 8 then												-- we need 8 digits TODO: but shouldn't this be testing for 14 digits?
		return inlineRed (err_warn_msgs.date3, 'error');
	end

	local year, month, day = snapdate:match ('(%d%d%d%d)(%d%d)(%d%d)');	-- no need for snapdatelong here

	if not is_valid_date (year, month, day) then
		return inlineRed(err_warn_msgs.date_err, 'error');
	end

	return makeDate(year, month, day, df) or inlineRed (err_warn_msgs.date7, 'error');

--[[ TODO: enable this (comment markers restored -- uncommented statements after 'return' are a Lua syntax error)
	snapdate = makeDate(year, month, day, df);
	if snapdate then
		if 14 == dlen then
			return snapdate;										-- return date
		else
			return snapdate, inlineRed(err_warn_msgs.ts_len, 'warning');	-- return date with warning message
		end
	else
		return inlineRed (err_warn_msgs.date7, 'error');			-- return error message
	end
]]
end
--[=[-------------------------< M A K E _ W I K I L I N K >----------------------------------------------------

Makes a wikilink; when both link and display text is provided, returns a wikilink in the form [[L|D]]; if only
link is provided, returns a wikilink in the form [[L]]; if neither are provided or link is omitted, returns an
empty string.

]=]

local function make_wikilink (link, display, no_link)
	-- '[[', '|', ']]' and '' literals restored (rendered away by the wiki scrape)
	if nil == no_link then
		if link and ('' ~= link) then
			if display and ('' ~= display) then
				return table.concat ({'[[', link, '|', display, ']]'});
			else
				return table.concat ({'[[', link, ']]'});
			end
		end
		return display or '';										-- link not set so return the display text
	else															-- no_link
		if display and ('' ~= display) then							-- if there is display text
			return display;											-- return that
		else
			return link or '';										-- return the target article name or empty string
		end
	end
end
--[[--------------------------< serviceName >-----------------------

Given a domain extracted by mw.uri.new() (eg. web.archive.org) set tail string and service ID

]]

local function serviceName(host, no_link)
	local tracking;
	local index;

	host = host:lower():gsub ('^web%.(.+)', '%1'):gsub ('^www%.(.+)', '%1');	-- lowercase, remove web. and www. subdomains

	if services[host] then
		index = host;												-- exact host match
	else
		for k, _ in pairs (services) do								-- fall back to substring match on a word boundary
			if host:find ('%f[%a]'..k:gsub ('([%.%-])', '%%%1')) then	-- escape pattern magic in the service key
				index = k;
				break;
			end
		end
	end

	if index then
		local out = {''};											-- empty string in [1] so that concatenated result has leading single space ('' restored; stripped by the scrape)
		ulx.url1.service = services[index][4] or 'other';
		tracking = services[index][5] or categories.other;
																	-- build tail string
		if false == services[index][1] then							-- select prefix
			table.insert (out, prefixes.at);
		elseif true == services[index][1] then
			table.insert (out, prefixes.atthe);
		else
			table.insert (out, services[index][1]);
		end

		table.insert (out, make_wikilink (services[index][2], services[index][3], no_link));	-- add article wikilink
		if services[index][6] then									-- add tail postfix if it exists
			table.insert (out, services[index][6]);
		end

		ulx.url1.tail = table.concat (out, ' ');					-- put it all together; result has leading space character
	else															-- here when unknown archive
		ulx.url1.service = 'other';
		tracking = categories.unknown;
		ulx.url1.tail = table.concat ({'', prefixes.at, host, inlineRed (err_warn_msgs.unknown_url)}, ' ');	-- TODO: call to inlineRed() does not specify 'error' or 'warning'; should it?
	end

	track[tracking] = 1
end
--[[--------------------------< parseExtraArgs >-----------------------

Parse numbered arguments starting at 2, such as url2..url10, date2..date10, title2..title10

For example, three url arguments not in numeric sequence (1..4..7): function only processes arguments
numbered 2 or greater (in this case 4 and 7).  It creates numeric sequenced table entries like:
	urlx.url2.url = <argument value for url4>
	urlx.url3.url = <argument value for url7>
Returns the number of URL arguments found numbered 2 or greater (in this case returns 2)

]]

local function parseExtraArgs(args)
	local j, argurl, argurl2, argdate, argtitle						-- (unused 'local i' removed; the for statement declares its own loop variable)

	j = 2
	for i = 2, maxurls do
		argurl = "url" .. i
		if args[argurl] then
			argurl2 = "url" .. j
			ulx[argurl2] = {}
			ulx[argurl2]["url"] = args[argurl]

			-- NOTE(review): date/title are looked up by the renumbered index j while url uses i --
			-- looks inconsistent for out-of-sequence arguments; confirm against the live module
			argdate = "date" .. j
			if args[argdate] then
				ulx[argurl2]["date"] = args[argdate]
			else
				ulx[argurl2]["date"] = inlineRed (err_warn_msgs.date_miss, 'warning');
			end

			argtitle = "title" .. j
			if args[argtitle] then
				ulx[argurl2]["title"] = args[argtitle]
			else
				ulx[argurl2]["title"] = nil
			end
			j = j + 1
		end
	end

	if j == 2 then
		return 0
	else
		return j - 2
	end
end
--[[--------------------------< comma >-----------------------

Given a date string, return "," if it's MDY

]]

local function comma(date)
	-- "or ''" restored (stripped by the scrape); returns empty string for nil dates and non-MDY formats
	return (date and date:match ('%a+ +%d%d?(,) +%d%d%d%d')) or '';
end
--[[--------------------------< createTracking >-----------------------

Return data in track[] ie. tracking categories

]]

local function createTracking()
	-- return '' literals restored (stripped by the scrape)
	if not excepted_pages[this_page.fullText] then					-- namespace:title/fragment is allowed to be categorized (typically this module's / template's testcases page(s))
		if uncategorized_namespaces[this_page.nsText] then			-- TODO: enable this chunk
			return '';												-- this page not to be categorized so return empty string
		end
		for _,v in ipairs (uncategorized_subpages) do				-- cycle through page name patterns
			if this_page.text:match (v) then						-- test page name against each pattern
				return '';											-- this subpage type not to be categorized so return empty string
			end
		end
	end

	local out = {};
	if tableLength(track) > 0 then
		for key, _ in pairs(track) do								-- loop through table
			table.insert (out, make_wikilink (key));				-- and convert category names to links
		end
	end
	return table.concat (out);										-- concat into one big string; empty string if table is empty
end
--[[--------------------------< createRendering >-----------------------

Return a rendering of the data in ulx[][]

NOTE(review): reconstructed from a markup-mangled scrape; literal strings marked below should be
verified against the live module.

TODO: when archive date is '*' ('index') leading archive extlink should be [<url> Archive index] instead of
[<url> Archived] index; code to support this has been added but is commented out for the time being; look for TODO1

]]

local function createRendering()
	local displayfield
	local out = {};

	local period1 = '';												-- '' restored; for backwards compat with {{wayback}}
	local period2 = '.';

	if 'none' == ulx.url1.format then								-- for {{wayback}}, {{webcite}}
		table.insert (out, '[');									-- open extlink markup
		table.insert (out, ulx.url1.url);							-- add url

		if ulx.url1.title then
			table.insert (out, ' ')									-- the required space
			table.insert (out, ulx.url1.title)						-- the title
			table.insert (out, ']');								-- close extlink markup
			table.insert (out, ulx.url1.tail);						-- tail text
			if ulx.url1.date then
				table.insert (out, ' (');							-- open date text; NOTE(review): source comment mentions an html entity here -- verify exact literal
				table.insert (out, 'index' == ulx.url1.date and 'archive' or 'archived');	-- add text
				table.insert (out, ' ');							-- insert a space
				table.insert (out, ulx.url1.date);					-- add date
				table.insert (out, ')');							-- close date text
			end
		else														-- no title
			table.insert (out, ' Archived]')						-- close extlink markup TODO1: remove this line
--TODO1		table.insert (out, 'index' == ulx.url1.date and ' Archive index]' or ' Archived]');	-- use this line for correct link label when date is 'index'
			if ulx.url1.date then
				if 'wayback' == ulx.url1.service then
					period1 = '.';
					period2 = '';									-- '' restored
				end
				table.insert (out, table.concat ({' ', ulx.url1.date}));	-- add date TODO1: remove this line
--[[TODO1		if 'index' ~= ulx.url1.date then					-- use this if for correct link label when date is 'index'
					table.insert (out, ulx.url1.date);
				end
]]
				table.insert (out, comma(ulx.url1.date));			-- add ',' if date format is mdy
				table.insert (out, ulx.url1.tail);					-- add tail text
				table.insert (out, period1);						-- terminate
			else													-- no date
				table.insert (out, ulx.url1.tail);					-- add tail text
			end
		end

		if 0 < ulx.url1.extraurls then								-- for multiple archive URLs
			local tot = ulx.url1.extraurls + 1
			table.insert (out, period2);							-- terminate first url
			table.insert (out, ' Additional archives: ');			-- add header text

			for i=2, tot do											-- loop through the additionals
				local index = table.concat ({'url', i});			-- make an index
				displayfield = ulx[index]['title'] and 'title' or 'date';	-- choose display text
				table.insert (out, '[');							-- open extlink markup
				table.insert (out, ulx[index]['url']);				-- add the url
				table.insert (out, ' ');							-- the required space
				table.insert (out, ulx[index][displayfield]);		-- add the label
				table.insert (out, ']');							-- close extlink markup
				table.insert (out, i==tot and '.' or ', ');			-- add terminator
			end
		end
		return table.concat (out);									-- make a big string and done

	else															-- for {{cite archives}}
		if 'addlarchives' == ulx.url1.format then					-- multiple archive services
			table.insert (out, 'Additional archives: ');			-- add header text
		else														-- multiple pages from the same archive
			table.insert (out, 'Additional pages archived on ');	-- add header text
			table.insert (out, ulx.url1.date);						-- add date to header text
			table.insert (out, ': ');								-- close header text
		end

		local tot = ulx.url1.extraurls + 1;
		for i=1, tot do												-- loop through the additionals
			local index = table.concat ({'url', i});				-- make an index
			table.insert (out, '[');								-- open extlink markup
			table.insert (out, ulx[index]['url']);					-- add url
			table.insert (out, ' ');								-- add required space

			displayfield = ulx[index]['title'];
			if 'addlarchives' == ulx.url1.format then
				if not displayfield then
					displayfield = ulx[index]['date']
				end
			else													-- must be addlpages
				if not displayfield then
					displayfield = table.concat ({'Page ', i});
				end
			end
			table.insert (out, displayfield);						-- add title, date, page label text
			table.insert (out, ']');								-- close extlink markup
			table.insert (out, (i==tot and '.' or ', '));			-- add terminator
		end
		return table.concat (out);									-- make a big string and done
	end
end
--[[--------------------------< W E B A R C H I V E >----------------------------------------------------------

template entry point

TODO: deprecate empty |nolink= as a 'positive' assertion that archive service is not to be linked

]]

local function webarchive(frame)
	local args = getArgs (frame, {									-- TODO: delete this assignment
		valueFunc = function (key, value)							-- this code so that we can detect and handle the oddity that is |nolink=
			if 'nolink' == key then									-- |nolink= is 'set' when present with or without assigned value; TODO: deprecate this peculiar use
				return value;										-- don't trim; we don't care (right now) what the value is except when nil and we can't trim nil
			elseif value then										-- all other values: if the value is not nil
				value = mw.text.trim (value);						-- trim whitespace
				if '' ~= value then									-- empty string when value was only whitespace or was empty
					return value;									-- return non-nil, non-empty values
				end
			end
			return nil;												-- value was nil, empty, or contained only whitespace
		end															-- end of valueFunc
		});
--	local args = getArgs (frame);									-- TODO: replace the above with this

	local data = mw.loadData (table.concat ({						-- make a data module name; sandbox or live
		'Module:Webarchive/data',
		frame:getTitle():find('sandbox', 1, true) and '/sandbox' or ''	-- this instance is ./sandbox then append /sandbox
		}));
	categories = data.categories;									-- fill in the forward declarations
	err_warn_msgs = data.err_warn_msgs;
	excepted_pages = data.excepted_pages;
	prefixes = data.prefixes;
	services = data.services;
	uncategorized_namespaces = data.uncategorized_namespaces;
	uncategorized_subpages = data.uncategorized_subpages;

	local date, format, msg, uri, url;

	verifydates = 'yes' == verifydates;								-- convert to boolean

	if args.url and args.url1 then									-- URL argument (first)
		return inlineError("url", "Conflicting |url= and |url1=.") .. createTracking();
	end

	url = args.url or args.url1;

	if not url then
		return inlineError("url", "Empty.") .. createTracking()
	end
	if mw.ustring.find( url, "https://web.http", 1, true ) then		-- track bug - TODO: IAbot bug; not known if the bug has been fixed; deferred
		track[categories.error] = 1;
		return inlineError("url", "https://web.http") .. createTracking()
	end
	if url == "https://web.archive.org/http:/" then					-- track bug - TODO: IAbot bug; not known if the bug has been fixed; deferred
		track[categories.error] = 1;
		return inlineError("url", "Invalid URL") .. createTracking()
	end

	ulx.url1 = {}
	ulx.url1.url = url
	if not (url:lower():find ('^http') or url:find ('^//')) then	-- TODO: is this a good idea? isn't it better to simply throw an error when url is malformed ...
		ulx.url1.url = 'http://' .. url								-- ... rather than apply this 'fix' that might not fix anything?
	end

	ulx.url1.extraurls = parseExtraArgs(args)

	uri = mw.uri.new (ulx.url1.url);								-- get a table of uri parts from this url
	serviceName(uri.host, args.nolink)

	if args.date and args.date1 then								-- date argument
		return inlineError("date", "Conflicting |date= and |date1=.") .. createTracking();
	end

	date = args.date or args.date1

	if 'wayback' == ulx.url1.service or 'locwebarchives' == ulx.url1.service then
		if '*' == date then											-- TODO: why is this not compared to url date?
			date = 'index';
		end
		if date then
			if verifydates then
				local ldf = dateFormat(date)
				if ldf then
					local udate, msg = decodeWaybackDate( uri.path, ldf )	-- get the url date in the same format as date in |date=; 'index' when wayback date is *
					if udate ~= date then
						date = udate .. inlineRed (err_warn_msgs.mismatch, 'warning') .. (msg or '');	-- mismatch: use url date; add message if there is one
					else
						date = date .. (msg or '');					-- add message if there is one
					end
				end
			end
		else														-- no |date=
			date, msg = decodeWaybackDate( uri.path, "iso" )
			if not date then
				date = inlineRed (err_warn_msgs.date1, 'error');	-- TODO: change this type of message so that it identifies url as source of error?
			else
				date = date .. (msg or '');							-- add message if there is one
			end
		end

	elseif 'webcite' == ulx.url1.service then
		if date then
			if verifydates then
				local ldf = dateFormat(date)
				if ldf then
					local udate = decodeWebciteDate( uri.path, ldf )	-- get the url date in the same format as date in |date=
					if 'query' ~= udate then						-- skip if query
						if udate ~= date then
							date = udate .. inlineRed (err_warn_msgs.mismatch, 'warning');
						end
					end
				end
			end
		else
			date = decodeWebciteDate( uri.path, "iso" )
			if date == "query" then
				date = inlineRed (err_warn_msgs.date_miss, 'warning');
			elseif not date then
				date = inlineRed (err_warn_msgs.date1, 'error');
			end
		end

	elseif 'archiveis' == ulx.url1.service then
		if date then
			if verifydates then
				local ldf = dateFormat(date)
				if ldf then
					local udate, msg = decodeArchiveisDate( uri.path, ldf )	-- get the url date in the same format as date in |date=
					if 'short link' ~= udate then					-- skip if short link
						if udate ~= date then
							date = udate .. inlineRed (err_warn_msgs.mismatch, 'warning') .. (msg or '');	-- mismatch: use url date; add message if there is one
						else
							date = date .. (msg or '');				-- add message if there is one
						end
					end
				end
			end
		else														-- no |date=
			date, msg = decodeArchiveisDate( uri.path, "iso" )
			if date == "short link" then
				date = inlineRed (err_warn_msgs.date_miss, 'warning');
			elseif not date then
				date = inlineRed (err_warn_msgs.date1, 'error');
			else
				date = date .. (msg or '');							-- add message if there is one
			end
		end

	else															-- some other service
		if not date then
			date = inlineRed (err_warn_msgs.date_miss, 'warning');
		end
	end
	ulx.url1.date = date

	format = args.format;											-- format argument

	if not format then
		format = "none"
	else
		if format == "addlpages" then
			if not ulx.url1.date then
				format = "none"
			end
		elseif format == "addlarchives" then
			format = "addlarchives"
		else
			format = "none"
		end
	end
	ulx.url1.format = format

	if args.title and args.title1 then								-- title argument
		return inlineError("title", "Conflicting |title= and |title1=.") .. createTracking();
	end

	ulx.url1.title = args.title or args.title1;

	local rend = createRendering()
	if not rend then
		rend = '<span style="font-size:100%" class="error citation-comment">Error in [[:Template:' .. tname .. ']]: Unknown problem. Please report on template talk page.</span>'
		track[categories.error] = 1;
	end

	return rend .. createTracking()
end
--[[--------------------------< E X P O R T E D   F U N C T I O N S >------------------------------------------
]]

return {webarchive = webarchive};	-- sole exported entry point; invoked as {{#invoke:Webarchive|webarchive}}