Abstract

Collaborative filtering is an important personalized method in recommender systems in internet commerce. It is infeasible that traditional collaborative filtering is based on absolute rating for items since users are difficult to accurately make an absolute rating for items, and also different users give different rating distribution. In this tutorial, it shows that how to use a Hama to calculate TCF.

Implementation

Build a user by item matrix, Set entries from the raw data

...

Get the pairs of all row key combinations w/o repetition

We don't have to recalculate the same value pair with reversed order.
ex) similar(UserA, UserB) == similar(UserB, UserA).

In this case, it is going to return 1, 2}, {1, 3}, {2, 3 by discarding 2, 1}, {3, 1}, {3, 2 from the full possible combination. Since there will be mC2 combinations (m : num keys), one can optimize it to have mC2 / N values per reducer (N : num-reducers). Something like :

partition(index i, key key_j, int N) { // N is num reducers
 // find the data per reducer
 int dataPerRed = mC2 / N; // assuming m is known
 int prev_sum = 0;
 // calculate the total combinations contributed by previous indexes
 for (k=1; k < i; k++) {
  prev_sum += m - k + 1; // this adds the number of combinations contributed by kth index
 }
 prev_sum += j - i + 1 // self contribution
 return prev_sum % dataPerRed
}

Calculate |a|·|b|cos(q) on Map/Reduce

...

Collect the similarity result of the two users

...

Pseudo code for TCF with Hama

import java.math.BigInteger;

import org.apache.hama.HamaConfiguration;
import org.apache.hama.Matrix;
import org.apache.hama.Vector;

public class TraditionalCF {
  public static double[][] data = { 
    { 2, 5, 1, 4 }, 
    { 4, 1, 3, 3 },
    { 3, 4, 2, 4 } 
  };

  public static void main(String[] args) {
    HamaConfiguration conf = new HamaConfiguration();

    // 1. Build a user by item matrix, Set entries from the raw data
    Matrix userByItem = new Matrix(conf, "userByItem");  
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 4; j++) {
        userByItem.set(i, j, data[i][j]);
      }
    }

    // 2. Get the pair set of all row key combinations
    Combination x = new Combination(data.length, 2);
    
    // 3. |a|·|b|cos(q) calculation
    while (x.hasMore()) {
      int[] pair = x.getNext();
      System.out.print("Similarity: (" + pair[0] + ", " + pair[1] + ") = ");

      Vector v1 = userByItem.getRow(pair[0]);
      Vector v2 = userByItem.getRow(pair[1]);
      double similarity = v1.dot(v2);

      // 4. Collect the similarity result of the two users
      System.out.println(similarity);
    }

    // Each part can be applied to large-scale job scheduling using Map/Reduce.
  }

  static class Combination {

    private int[] a;
    private int n;
    private int r;
    private BigInteger numLeft;
    private BigInteger total;

    public Combination(int n, int r) {
      if (r > n) {
        throw new IllegalArgumentException();
      }
      if (n < 1) {
        throw new IllegalArgumentException();
      }
      this.n = n;
      this.r = r;
      a = new int[r];
      BigInteger nFact = getFactorial(n);
      BigInteger rFact = getFactorial(r);
      BigInteger nminusrFact = getFactorial(n - r);
      total = nFact.divide(rFact.multiply(nminusrFact));
      reset();
    }

    public void reset() {
      for (int i = 0; i < a.length; i++) {
        a[i] = i;
      }
      numLeft = total;
    }

    public BigInteger getNumLeft() {
      return numLeft;
    }

    public boolean hasMore() {
      return numLeft.compareTo(BigInteger.ZERO) > 0;
    }

    public BigInteger getTotal() {
      return total;
    }

    private BigInteger getFactorial(int n) {
      BigInteger fact = BigInteger.ONE;
      for (int i = n; i > 1; i--) {
        fact = fact.multiply(BigInteger.valueOf(i));
      }
      return fact;
    }

    public int[] getNext() {
      if (numLeft.equals(total)) {
        numLeft = numLeft.subtract(BigInteger.ONE);
        return a;
      }

      int i = r - 1;
      while (a[i] == n - r + i) {
        i--;
      }
      a[i]++;
      for (int j = i + 1; j < r; j++) {
        a[j] = a[i] + j - i;
      }

      numLeft = numLeft.subtract(BigInteger.ONE);
      return a;
    }
  }
}