{
  "nbformat_minor": 0, 
  "nbformat": 4, 
  "cells": [
    {
      "execution_count": null, 
      "cell_type": "code", 
      "source": [
        "%matplotlib inline"
      ], 
      "outputs": [], 
      "metadata": {
        "collapsed": false
      }
    }, 
    {
      "source": [
        "\n# Roget\n\n\nBuild a directed graph of 1022 categories and\n5075 cross-references as defined in the 1879 version of Roget's Thesaurus\ncontained in the datafile roget_dat.txt. This example is described in\nSection 1.2 in Knuth's book (see [1]_ and [2]_).\n\nNote that one of the 5075 cross references is a self loop yet\nit is included in the graph built here because\nthe standard networkx `DiGraph` class allows self loops.\n(cf. 400pungency:400 401 403 405).\n\nReferences\n----------\n\n.. [1] Donald E. Knuth,\n   \"The Stanford GraphBase: A Platform for Combinatorial Computing\",\n   ACM Press, New York, 1993.\n.. [2] http://www-cs-faculty.stanford.edu/~knuth/sgb.html\n\n"
      ], 
      "cell_type": "markdown", 
      "metadata": {}
    }, 
    {
      "execution_count": null, 
      "cell_type": "code", 
      "source": [
        "from __future__ import print_function\n\n# Authors: Brendt Wohlberg, Aric Hagberg (hagberg@lanl.gov)\n# Date: 2005-04-01 07:56:22 -0700 (Fri, 01 Apr 2005)\n\n#    Copyright (C) 2004-2017 by\n#    Aric Hagberg <hagberg@lanl.gov>\n#    Dan Schult <dschult@colgate.edu>\n#    Pieter Swart <swart@lanl.gov>\n#    All rights reserved.\n#    BSD license.\n\nimport gzip\nimport re\nimport sys\n\nimport matplotlib.pyplot as plt\nfrom networkx import nx\n\ndef roget_graph():\n    \"\"\" Return the thesaurus graph from the roget.dat example in\n    the Stanford Graph Base.\n    \"\"\"\n    # open file roget_dat.txt.gz (or roget_dat.txt)\n    fh = gzip.open('roget_dat.txt.gz', 'r')\n\n    G = nx.DiGraph()\n\n    for line in fh.readlines():\n        line = line.decode()\n        if line.startswith(\"*\"):  # skip comments\n            continue\n        if line.startswith(\" \"):  # this is a continuation line, append\n            line = oldline + line\n        if line.endswith(\"\\\\\\n\"):  # continuation line, buffer, goto next\n            oldline = line.strip(\"\\\\\\n\")\n            continue\n\n        (headname, tails) = line.split(\":\")\n\n        # head\n        numfind = re.compile(\"^\\d+\")  # re to find the number of this word\n        head = numfind.findall(headname)[0]  # get the number\n\n        G.add_node(head)\n\n        for tail in tails.split():\n            if head == tail:\n                print(\"skipping self loop\", head, tail, file=sys.stderr)\n            G.add_edge(head, tail)\n\n    return G\n\n\nif __name__ == '__main__':\n    G = roget_graph()\n    print(\"Loaded roget_dat.txt containing 1022 categories.\")\n    print(\"digraph has %d nodes with %d edges\"\n          % (nx.number_of_nodes(G), nx.number_of_edges(G)))\n    UG = G.to_undirected()\n    print(nx.number_connected_components(UG), \"connected components\")\n\n    options = {\n        'node_color': 'black',\n        'node_size': 1,\n        'line_color': 'grey',\n        'linewidths': 0,\n        'width': 0.1,\n    }\n    nx.draw_circular(UG, **options)\n    plt.show()"
      ], 
      "outputs": [], 
      "metadata": {
        "collapsed": false
      }
    }
  ], 
  "metadata": {
    "kernelspec": {
      "display_name": "Python 2", 
      "name": "python2", 
      "language": "python"
    }, 
    "language_info": {
      "mimetype": "text/x-python", 
      "nbconvert_exporter": "python", 
      "name": "python", 
      "file_extension": ".py", 
      "version": "2.7.12", 
      "pygments_lexer": "ipython2", 
      "codemirror_mode": {
        "version": 2, 
        "name": "ipython"
      }
    }
  }
}