;;; tagcamp.el --- code for my tagcamp presentation ;; Copyright (C) 2005 EVDB, Inc. ;; Author: Edward O'Connor ;; Keywords: convenience ;; This file is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 2, or (at your option) ;; any later version. ;; This file is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs; see the file COPYING. If not, write to ;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ;; Boston, MA 02110-1301, USA. ;;; Commentary: ;; This is not the most idiomatic Elisp code. ;;; Code: ;;; general lisp utilities (require 'cl) (require 'url) (require 'xml) ;; Web scraping (require 'scrape) ;; Web APIs (require 'evdb) (require 'technorati) ;;; link data management (defvar tc-link-file "~/web/html/2005/TagCamp/link-data") (defun add-link-site-tag-entry (link site tag) (or (progn (goto-char (point-min)) (re-search-forward (concat "^" (regexp-quote link) "[\t]" (regexp-quote site) "[\t]") nil t)) (progn (goto-char (point-min)) (re-search-forward (concat "^" (regexp-quote link) "[\t]") nil t)) (goto-char (point-max))) (goto-char (line-end-position)) (insert "\n") (insert (format "%s\t%s\t%s\t0" link site tag))) (defun goto-link-site-tag-entry (link site tag) (goto-char (point-min)) (or (re-search-forward (concat "^" (regexp-quote link) "[\t]" (regexp-quote site) "[\t]" (regexp-quote tag) "[\t]") nil t) (add-link-site-tag-entry link site tag))) (defun get-link-tag-count (link tag) (let ((count 0)) (with-current-buffer (find-file-noselect tc-link-file) (save-excursion (goto-char (point-min)) (while (re-search-forward (concat "^" (regexp-quote link) "[\t][^\t]+" ; skip over site "[\t]" (regexp-quote tag) "[\t]\\([0-9]+\\)") nil t) (setq count (+ count (string-to-number (match-string 1))))))) count)) (defun get-link-total-tag-count (link) (let ((count 0)) (with-current-buffer (find-file-noselect tc-link-file) (save-excursion (goto-char (point-min)) (while (re-search-forward (concat "^" (regexp-quote link) "[\t][^\t]+" ; skip over site "[\t][^\t]+" ; skip over tag "[\t]\\([0-9]+\\)") nil t) (setq count (+ count (string-to-number (match-string 1))))))) count)) (defun get-link-tags (link) (let (tags) (with-current-buffer (find-file-noselect tc-link-file) (save-excursion (goto-char (point-min)) (while (re-search-forward (concat "^" (regexp-quote link) "[\t][^\t]+" ; skip over site "[\t]\\([^\t]+\\)") nil t) (push (match-string 1) tags)))) (eshell-uniqify-list (sort tags 'string-lessp)))) (defun get-links () (let (links) (with-current-buffer (find-file-noselect tc-link-file) (save-excursion (goto-char (point-min)) (while (re-search-forward "^\\([^\n\t]+\\)"nil t) (push (match-string 1) links)))) (eshell-uniqify-list (sort links 'string-lessp)))) (defun raise-link-site-tag-count (link site tag delta) (with-current-buffer (find-file-noselect tc-link-file) (save-excursion (goto-link-site-tag-entry link site tag) (goto-char (line-beginning-position)) (looking-at (concat "^" (regexp-quote link) "[\t]" (regexp-quote site) "[\t]" (regexp-quote tag) "[\t]\\([0-9]+\\)$")) (let ((old (string-to-number (match-string 1))) new) (setq new (+ old delta)) (replace-match (number-to-string new) t t nil 1) new)))) ;; example ; (raise-link-site-tag-count "http://www.tagcamp.org/" "technorati" "tagcamp" 100) ;;; tag data management (defvar tc-tag-file "~/web/html/2005/TagCamp/tag-data") (defun add-tag-site-entry (tag site) (goto-char (point-min)) (or (re-search-forward (concat "^" (regexp-quote tag) "[\t]") nil t) (goto-char (point-max))) (goto-char (line-end-position)) (insert "\n") (insert (format "%s\t%s\t0" tag site))) (defun goto-tag-site-entry (tag site) (goto-char (point-min)) (or (re-search-forward (concat "^" (regexp-quote tag) "[\t]" (regexp-quote site) "[\t]") nil t) (add-tag-site-entry tag site))) (defun raise-tag-site-count (tag site delta) (subst-char-in-string ?\n ?- tag t) ;; do it (with-current-buffer (find-file-noselect tc-tag-file) (save-excursion (goto-tag-site-entry tag site) (goto-char (line-beginning-position)) (looking-at (concat "^" (regexp-quote tag) "[\t]" (regexp-quote site) "[\t]\\([0-9]+\\)$")) (let ((old (string-to-number (match-string 1))) new) (setq new (+ old delta)) (replace-match (number-to-string new) t t nil 1) new)))) ;; example ; (raise-tag-site-count "tagcamp" "delicious" 10) (defun set-tag-site-count (tag site count) (with-current-buffer (find-file-noselect tc-tag-file) (save-excursion (goto-tag-site-entry tag site) (goto-char (line-beginning-position)) (looking-at (concat "^" (regexp-quote tag) "[\t]" (regexp-quote site) "[\t]\\([0-9]+\\)$")) (replace-match (number-to-string count) t t nil 1) count))) (defun get-tag-site-count (tag site) (with-current-buffer (find-file-noselect tc-tag-file) (save-excursion (goto-tag-site-entry tag site) (goto-char (line-beginning-position)) (looking-at (concat "^" (regexp-quote tag) "[\t]" (regexp-quote site) "[\t]\\([0-9]+\\)$")) (string-to-number (match-string 1))))) (defun get-tag-total-count (tag) (let ((count 0)) (with-current-buffer (find-file-noselect tc-tag-file) (save-excursion (goto-char (point-min)) (while (re-search-forward (concat "^" (regexp-quote tag) "[\t][^\t]+" ; skip over site "[\t]\\([0-9]+\\)") nil t) (setq count (+ count (string-to-number (match-string 1))))))) count)) (defun get-total-tag-count () (let ((count 0)) (with-current-buffer (find-file-noselect tc-tag-file) (save-excursion (goto-char (point-min)) (while (re-search-forward (concat "^[^\t]+" ; skip over tag "[\t][^\t]+" ; skip over site "[\t]\\([0-9]+\\)") nil t) (setq count (+ count (string-to-number (match-string 1))))))) count)) (defun get-tag-technorati-count (tag) (let ((result (technorati-api/tag tag 1))) (setq result (plist-get result :result)) (+ (cadr (assoc :postsmatched result)) (cadr (assoc :blogsmatched result))))) (defun get-links-from-seid (seid) (let ((event (evdb-api/events/get seid)) links) (setq links (car (xml-get-children event 'links))) (setq links (xml-get-children links 'link)) (setq links (mapcar (lambda (link-node) (rest-api-join (xml-node-children (car (xml-get-children link-node 'url))))) links)) links)) (defun get-sites-from-link (link) (let* ((thingies (technorati-api/cosmos link 'links 100)) (items (plist-get thingies :items))) (remove "" (eshell-uniqify-list (sort (eshell-flatten-list (mapcar (lambda (item) (cdr (assoc :nearestpermalink item))) items)) 'string-equal))))) (defun extract-reltag-tags-under-node (node tags) "Return a list of the anchor element descendents of NODE. Specifically, those with rel='tag'." (mapc (lambda (child) (when (listp child) (if (and (eq (xml-node-name child) 'a) (string-equal "tag" (xml-get-attribute child 'rel))) (let ((tag (url-unhex-string (car (last (split-string (xml-get-attribute child 'href) "/")))))) (puthash tag (1+ (gethash tag tags 0)) tags)) (extract-reltag-tags-under-node child tags)))) (xml-node-children node))) (defun get-tags-from-site (site) (let ((tree (ignore-errors (scrape-url site))) (tags (make-hash-table :test 'equal))) (extract-reltag-tags-under-node tree tags) tags)) (defun delicious-url (url) (ignore-errors (scrape-url (concat "http://del.icio.us/url/" (md5 url))))) (defun extract-delicious-tags-under-node (node tags &optional parents) (let ((parents (cons (xml-node-name node) parents))) (mapc (lambda (child) (when (listp child) (if (and ;; is an element (eq (xml-node-name child) 'a) ;; href="/username/tag" (string-match "/[^/]+/[^/]+" (xml-get-attribute child 'href)) ;; in correct place in dom tree (equal parents '(div li ol div body html))) (let ((tag (car (xml-node-children child)))) (puthash tag (1+ (gethash tag tags 0)) tags)) (extract-delicious-tags-under-node child tags parents)))) (xml-node-children node)) tags)) (defun get-link-delicious-tags (link) (let ((tree (delicious-url link)) (tags (make-hash-table :test 'equal))) (extract-delicious-tags-under-node tree tags) tags)) (defun do-link-site (link site) (message "do-link-site %s %s" link site) (let ((tags (get-tags-from-site site))) (maphash (lambda (tag count) (raise-link-site-tag-count link site tag count)) tags))) (defun do-link (link) (message "do-link %s" link) (let ((sites (get-sites-from-link link)) (del-tags (get-link-delicious-tags link))) (mapc (lambda (site) (do-link-site link site)) sites) (maphash (lambda (tag count) (raise-tag-site-count tag "http://del.icio.us/" count) (raise-link-site-tag-count link "http://del.icio.us/" tag count)) del-tags))) (defun get-technorati-tag-counts () (with-current-buffer (find-file-noselect tc-link-file) (save-excursion (goto-char (point-min)) (while (re-search-forward "^\\([^\t]+\\)[\t]\\([^\t]+\\)[\t]\\([^\t]+\\)[\t]\\([0-9]+\\)$" nil t) (let ((link (match-string 1)) (site (match-string 2)) (tag (match-string 3)) (count (string-to-number (match-string 4)))) ;; (set-tag-site-count tag site count) (unless (> (get-tag-site-count tag "http://technorati.com/") 0) (set-tag-site-count tag "http://technorati.com/" (condition-case nil (get-tag-technorati-count tag) (error 0))))))))) (defun get-link-tagshare (link tag) (/ (float (get-link-tag-count link tag)) (+ (float (get-link-total-tag-count link)) 1))) (defvar tc-stop-words nil) (setq tc-stop-words '("conference" "technology" "music" "education" "architecture" "development" "tagging")) (defun rank-link-tags-by-fitness (link) (let ((tags (get-link-tags link)) tags-with-fitness) ;; filter idiosyncrasies (setq tags (remove-if (lambda (tag) (or (> 2 (get-link-tag-count link tag)) (> 2 (get-tag-total-count tag)) ;; kill "stop words" (string-match (regexp-opt tc-stop-words) tag))) tags)) (setq tags-with-fitness (mapcar (lambda (tag) (cons tag (/ (get-link-tagshare link tag) ;; avoid N/0. penalize big numbers. (+ (/ (float (get-tag-total-count tag)) (+ (float (get-total-tag-count)) 1)) 1)))) tags)) (setq tags-with-fitness (sort tags-with-fitness (lambda (c1 c2) (> (cdr c1) (cdr c2))))) (setq tags-with-fitness (remove-if (lambda (cons) (> 0.05 (cdr cons))) tags-with-fitness)) tags-with-fitness)) (provide 'tagcamp) ;;; tagcamp.el ends here