src/rss/reader-db.lisp

DOWNLOAD
(in-package :murja.rss.reader-db)

(defparameter unix-epoch
  (encode-universal-time 0 0 0 1 1 1970 0))

(defun universal-to-unix (universal-time)
  (- universal-time unix-epoch))

(defqueries "reader-fns")

(defun subscribe-to-feed2 (feed-name feed-url owner)
  (insert-feed feed-name feed-url (murja.model.user:user-id owner)))

(defun download (url modified-since)
  "Drakma decides to return either strings or array<byte>s based on random
(can't find the documentation about this logic) whims. This function performs
the http-request through drakma, checks which is returned, and in case of array<byte>s,
pipes it through trivial-utf-8:utf-8-bytes-to-string.

The second value returned is the last-modified response header as simpledate-timestamp"
  (multiple-value-bind (body status headers) (drakma:http-request url
								  :user-agent "drakma / murja-blog-engine rss updater"
								  :additional-headers (when modified-since
											(list (cons "If-Modified-Since" modified-since))))
    (let* ((last-modified-header (cdr (assoc :last-modified headers)))
	   (_ (log:info "last-modified-header: ~a~%" last-modified-header))
	   (last-modified (when last-modified-header
			    (lisp-fixup:if-modified-since->simpledate-timestamp
			     last-modified-header))))
      (log:info "last-modified: ~a~%" last-modified)

      (values
       (if (and (arrayp body)
		(not (stringp body)))
	   (trivial-utf-8:utf-8-bytes-to-string body)
	   body)
       last-modified
       status))))

;; (setf drakma:*header-stream* *standard-output*)
(defun get-child-item-value (name children)
  (some->>
    children
    (remove-if-not (lambda (node)
		     (string= (xmls:node-name node)
			      name)))
    first
    xmls:node-children
    first))

(defun parse-rss (feed-id feed-parsed)
  (log:info "Parsing rss")
  (let ((channel (first (xmls:node-children feed-parsed))))
    (dolist (item (remove-if-not (lambda (item)
				   (string= (xmls:node-name item) "item"))
				 (xmls:node-children channel)))
      (let ((title (or (get-child-item-value "title" (xmls:node-children item)) ""))
	    (link (get-child-item-value "link" (xmls:node-children item)))
	    (description (get-child-item-value "description" (xmls:node-children item)))
	    (author (or
		     (get-child-item-value "author" (xmls:node-children item))
		     ;; author seems to be optional value, let's get title from <channel> if missing
		     (get-child-item-value "title" (xmls:node-children channel))))
	    (pubDate (universal-to-unix
		      (parse-date-time (get-child-item-value "pubDate" (xmls:node-children item))))))
	(log:info "Parsed ~a as ~a"
		  (get-child-item-value "pubDate" (xmls:node-children item))
		  pubdate)
	(insert-feed-item title link description author pubDate feed-id)))))

(defun parse-atom (author feed-id feed-parsed)
  (log:info "parsing atom~%")
  (let ((entries (->>
		   feed-parsed
		   (xmls:node-children)
		   (remove-if-not (lambda (node)
				    (string= (xmls:node-name node) "entry"))))))
    (dolist (entry entries)
      (let ((title (or (get-child-item-value "title" (xmls:node-children entry)) ""))
	    (link (get-child-item-value "id" (xmls:node-children entry))) ;; atom calls hrefs 'id'
	    (description (get-child-item-value "content" (xmls:node-children entry)))
	    (pubDate (universal-to-unix
		      (parse-date-time (get-child-item-value "updated" (xmls:node-children entry))))))

	(insert-feed-item title link description author pubDate feed-id)))))

(defun update-feed (feed)
  (let* ((if-modified-since (gethash "last_modified" feed))
	 (if-modified-since (unless (equalp if-modified-since :null)
			      if-modified-since))
	 (url (gethash "url" feed))
	 (feed-id (gethash "id" feed)))
    (log:info "Updating feed ~a~%" url)
    (multiple-value-bind (feed-contents last-modified status) (download url if-modified-since)
      (if (equalp status 200)
	  (let* ((feed-parsed (xmls:parse feed-contents))
		 (feed-ns (xmls:node-ns feed-parsed)))

	    (log:info "did we get rss or atom?")
	    (if (and feed-ns
		     (cl-ppcre:all-matches-as-strings "Atom" feed-ns))
		(let ((author (first (xmls:node-children
				      (get-child-item-value "author" (xmls:node-children feed-parsed))))))
		  (parse-atom author feed-id feed-parsed))
		(parse-rss feed-id feed-parsed))
	    (when last-modified
	      (log:info "Updating ~a last-modified to ~a~%" feed-id last-modified)
	      (update-last-modified feed-id last-modified)
	      (log:info "Updated~%")))
	  (log:warn "Received status ~d" status)))))
      

(defun current-hour ()
  (multiple-value-bind (second minute hour) (decode-universal-time (get-universal-time))
    hour))

(defun current-minute ()
  (multiple-value-bind (second minute hour) (decode-universal-time (get-universal-time))
    hour))

(defun current-datetime ()
  (multiple-value-bind (second minute hour day month year) (decode-universal-time (get-universal-time))
    (format nil "~d-~2,'0d-~2,'0dT~2,'0d:~2,'0d:~2,'0dZ" year month day hour minute second)))

(defvar *last-updated* nil)
(defvar *updates* nil)

(defun update-feeds ()
  (when (or (not *last-updated*)
	    ;; hourly rate limit
	    (> (- (current-hour) *last-updated*) 1))
    (log:info "Updating all feeds~%")
    (dolist (feed (coerce (get-all-feeds) 'list))
      (update-feed feed))

    (setf *last-updated* (current-hour))
    (push (current-datetime) *updates*)))